Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-01 06:42:23 +08:00)
update flake8 version to support pre-commit in python3.12 (#3000)
* update flake8 version to support pre-commit in python3.12
* polish code
.flake8 (2 changes)
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E402, E501, E731, E741, W503, W605, E722
+ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271
 max-line-length = 119
 
 # E402: module level import not at top of file
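The newly ignored codes are standard pycodestyle checks. As a hedged illustration (meanings taken from pycodestyle's public catalogue, not from this repository), each line in the Python sketch below would normally trigger the code named in its comment; W604 (deprecated backticks) is Python 2 only and has no runnable Python 3 example.

# Hedged sketch: lines that would trip the pycodestyle codes newly added to `ignore`.
pair = {1:2}              # E231 missing whitespace after ':'
a = 1; b = 2              # E702 multiple statements on one line (semicolon)
area = 3*7                # E226 missing whitespace around arithmetic operator
total    = a + b          # E221 multiple spaces before operator
ok = not a in [1, 2]      # E713 test for membership should use "not in"
flag = a if  ok else b    # E271 multiple spaces after keyword
print(pair, area, total, ok, flag)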
|
@@ -7,7 +7,7 @@ default_stages:
   # - manual # Run in CI
 repos:
   - repo: https://github.com/psf/black.git
-    rev: 22.8.0
+    rev: 25.1.0
     hooks:
       - id: black
         files: \.(py|pyi)$
@@ -18,7 +18,7 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/PyCQA/flake8
-    rev: 4.0.1
+    rev: 7.0.0
     hooks:
       - id: flake8
         # Code check
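Much of the Python churn in the hunks below is mechanical reformatting from bumping black to 25.1.0 (alongside flake8 7.0.0, which runs under Python 3.12's pre-commit). A minimal, self-contained sketch of black's "magic trailing comma" behaviour, which is why one-line tuple targets and calls get exploded one element per line; `split_pair` is a hypothetical helper used only so the snippet runs on its own:

def split_pair(value: int) -> tuple[int, int]:
    # Hypothetical helper, only here to make the example runnable.
    return value // 2, value - value // 2

# Because the unpacking target ends with a trailing comma, black keeps it
# exploded one name per line instead of collapsing it back onto one line.
(
    left_half,
    right_half,
) = split_pair(
    9,
)

print(left_half, right_half)  # -> 4 5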
|
@@ -29,7 +29,13 @@ for i in range(bs):
     ids_len = seq_lens[i, 0]
     input_ids[i, 0:ids_len] = np.random.randint(1, 10, seq_lens[i, 0], "int64")
 
-(x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k,) = get_padding_offset(
+(
+    x_remove_padding,
+    cum_offsets_out,
+    padding_offset,
+    cu_seqlens_q,
+    cu_seqlens_k,
+) = get_padding_offset(
     paddle.to_tensor(input_ids),
     paddle.to_tensor(cum_offset),
     paddle.to_tensor(token_num),
@@ -473,7 +473,10 @@ class PrefixCacheManager:
         current_time = time.time()
         self._update_matched_node_info(req_id, match_block_node, current_time)
         # 2. prepare cache
-        (gpu_recv_block_ids, gpu_extra_block_ids,) = self._prepare_cache(
+        (
+            gpu_recv_block_ids,
+            gpu_extra_block_ids,
+        ) = self._prepare_cache(
             req_id,
             input_ids,
             block_size,
@@ -113,10 +113,7 @@ class CudaRTLibrary:
         Function(
             "cudaStreamIsCapturing",
             cudaError_t,
-            [
-                cudaStream_t,
-                ctypes.POINTER(cudaStreamCaptureStatus)
-            ]
+            [cudaStream_t, ctypes.POINTER(cudaStreamCaptureStatus)],
         ),
     ]
 
@@ -197,9 +194,8 @@ class CudaRTLibrary:
             self.funcs["cudaIpcOpenMemHandle"](ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)
         )
         return devPtr
 
     def cudaStreamIsCapturing(self, stream: cudaStream_t) -> ctypes.c_int:
         is_capturing = ctypes.c_int()
-        self.CUDART_CHECK(
-            self.funcs["cudaStreamIsCapturing"](stream, is_capturing)
-        )
+        self.CUDART_CHECK(self.funcs["cudaStreamIsCapturing"](stream, is_capturing))
         return is_capturing
@@ -559,8 +559,8 @@ class EngineArgs:
             "--ips",
             type=lambda s: s.split(",") if s else None,
             default=EngineArgs.ips,
-            help=
-            "IP addresses of all nodes participating in distributed inference.")
+            help="IP addresses of all nodes participating in distributed inference.",
+        )
 
         # Performance tuning parameters group
         perf_group = parser.add_argument_group("Performance Tuning")
@@ -41,7 +41,7 @@ class EngineClient:
         mm_processor_kwargs,
         enable_mm=False,
         reasoning_parser=None,
-        data_parallel_size=1
+        data_parallel_size=1,
     ):
         input_processor = InputPreprocessor(
             tokenizer,
@@ -55,8 +55,7 @@ class EngineClient:
         self.data_processor = input_processor.create_processor()
         self.max_model_len = max_model_len
         max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
-        array_size = min(
-            max_chips_per_node, tensor_parallel_size * data_parallel_size)
+        array_size = min(max_chips_per_node, tensor_parallel_size * data_parallel_size)
         self.worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_healthy_live_signal = IPCSignal(
             name="worker_healthy_live_signal",
@@ -113,7 +113,7 @@ async def lifespan(app: FastAPI):
         args.mm_processor_kwargs,
         args.enable_mm,
         args.reasoning_parser,
-        args.data_parallel_size
+        args.data_parallel_size,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
     chat_handler = OpenAIServingChat(engine_client, pid, args.ips)
@@ -478,7 +478,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = None
     top_k: Optional[int] = None
     min_p: Optional[float] = None
     user: Optional[str] = None
     metadata: Optional[dict] = None
     extra_body: Optional[dict] = None
     return_token_ids: Optional[bool] = False
@@ -19,9 +19,10 @@ import time
 import traceback
 import uuid
 from typing import List, Optional
-import numpy as np
+
 import aiozmq
 import msgpack
+import numpy as np
 from aiozmq import zmq
 
 from fastdeploy.entrypoints.openai.protocol import (
@@ -151,7 +152,9 @@ class OpenAIServingChat:
         if request.metadata is not None:
             enable_thinking = request.metadata.get("enable_thinking")
             include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
         while num_choices > 0:
             try:
                 raw_data = await asyncio.wait_for(dealer.read(), timeout=10)
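The `enable_return_token_ids` expression reformatted in the hunk above encodes a simple precedence: the request-level `return_token_ids` flag wins, otherwise the same key inside `extra_body` is consulted. A small stand-alone sketch of that logic (a plain function for illustration, not the serving class itself):

def wants_token_ids(return_token_ids, extra_body):
    # Mirrors the expression in the diff: top-level flag first, then extra_body.
    return bool(return_token_ids) or (extra_body is not None and extra_body.get("return_token_ids", False))

print(wants_token_ids(True, None))                        # True
print(wants_token_ids(None, {"return_token_ids": True}))  # True
print(wants_token_ids(False, {}))                         # False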
@@ -193,13 +196,13 @@ class OpenAIServingChat:
                 choice = ChatCompletionResponseStreamChoice(
                     index=i,
                     delta=DeltaMessage(
                         role="assistant",
                         content="",
                         reasoning_content="",
                         tool_calls=None,
                         prompt_token_ids=None,
                         completion_token_ids=None,
-                    )
+                    ),
                 )
                 if enable_return_token_ids:
                     choice.delta.prompt_token_ids = list(prompt_token_ids)
@@ -238,10 +241,10 @@ class OpenAIServingChat:
 
                 previous_num_tokens += len(output["token_ids"])
                 delta_message = DeltaMessage(
                     content=delta_text,
-                    reasoning_content=output.get("reasoning_content"), \
+                    reasoning_content=output.get("reasoning_content"),
                     prompt_token_ids=None,
                     completion_token_ids=None,
                     tool_calls=output.get("tool_call_content", []),
                 )
 
@@ -329,7 +332,9 @@ class OpenAIServingChat:
         final_res = None
         enable_thinking = None
         include_stop_str_in_output = False
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
             dealer.write([b"", request_id.encode("utf-8")])
@@ -403,7 +408,7 @@ class OpenAIServingChat:
                 reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call_content"),
                 prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-                completion_token_ids=completion_token_ids if enable_return_token_ids else None,
+                completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
             )
             logprobs_full_res = None
             if logprob_contents:
@@ -18,9 +18,10 @@ import asyncio
 import time
 import uuid
 from typing import List
-import numpy as np
+
 import aiozmq
 import msgpack
+import numpy as np
 from aiozmq import zmq
 
 from fastdeploy.engine.request import RequestOutput
@@ -48,7 +49,6 @@ class OpenAIServingCompletion:
         else:
             self.master_ip = self.master_ip.split(",")[0]
 
-
     def _check_master(self):
         if self.master_ip is None:
             return True
@@ -238,7 +238,9 @@ class OpenAIServingCompletion:
                 model=model_name,
                 choices=choices,
             )
-            enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+            enable_return_token_ids = request.return_token_ids or (
+                request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+            )
             current_waiting_time = 0
             while num_choices > 0:
                 try:
@@ -267,12 +269,16 @@ class OpenAIServingCompletion:
                             id=request_id,
                             created=created_time,
                             model=model_name,
-                            choices=[CompletionResponseStreamChoice(
-                                index=idx,
-                                text="",
-                                prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None,
-                                completion_token_ids=None,
-                            )]
+                            choices=[
+                                CompletionResponseStreamChoice(
+                                    index=idx,
+                                    text="",
+                                    prompt_token_ids=(
+                                        list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None
+                                    ),
+                                    completion_token_ids=None,
+                                )
+                            ],
                         )
                         yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                         first_iteration[idx] = False
@@ -286,15 +292,17 @@ class OpenAIServingCompletion:
 
                         output = res["outputs"]
 
-                        choices.append(CompletionResponseStreamChoice(
-                            index=idx,
-                            text=output["text"],
-                            prompt_token_ids=None,
-                            completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
-                            tool_calls=output.get("tool_call_content"),
-                            reasoning_content=output.get("reasoning_content"),
-                            arrival_time=arrival_time
-                        ))
+                        choices.append(
+                            CompletionResponseStreamChoice(
+                                index=idx,
+                                text=output["text"],
+                                prompt_token_ids=None,
+                                completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None),
+                                tool_calls=output.get("tool_call_content"),
+                                reasoning_content=output.get("reasoning_content"),
+                                arrival_time=arrival_time,
+                            )
+                        )
                         if res["finished"]:
                             if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                                 chunk.choices[0].finish_reason = "stop"
@@ -353,12 +361,14 @@ class OpenAIServingCompletion:
         created_time: int,
         model_name: str,
         prompt_batched_token_ids: list(),
-        completion_batched_token_ids: list()
+        completion_batched_token_ids: list(),
    ) -> CompletionResponse:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
 
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -385,8 +395,8 @@ class OpenAIServingCompletion:
                 index=len(choices),
                 text=output_text,
                 prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-                completion_token_ids=completion_token_ids if enable_return_token_ids else None,
-                reasoning_content=output.get('reasoning_content'),
+                completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
+                reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call_content"),
                 logprobs=None,
                 finish_reason=None,
@@ -99,8 +99,7 @@ class ErnieProcessor(BaseDataProcessor):
 
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is None and request.messages is None:
-                raise ValueError(
-                    f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
+                raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
             if request.prompt is not None:
                 prompt = request.prompt if request.prompt is not None else request.messages[0]
                 prompt = prompt[0] if isinstance(prompt, list) else prompt
@@ -164,8 +163,8 @@ class ErnieProcessor(BaseDataProcessor):
             req_id = request.get("request_id", None)
             data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
         else:
-            request['prompt_token_ids'] = self.messages2ids(request)
-            if len(request['prompt_token_ids']) == 0:
+            request["prompt_token_ids"] = self.messages2ids(request)
+            if len(request["prompt_token_ids"]) == 0:
                 raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
 
         # truncate prompts that exceed the length limit
@@ -246,8 +245,7 @@ class ErnieProcessor(BaseDataProcessor):
         if is_end:
             full_text = previous_texts + delta_text
             if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict)
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
             else:
@@ -507,5 +507,6 @@ class DataProcessor:
         tokens = self.tokenizer.tokenize(prompt_token_str)
         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
         data_processor_logger.info(
-            f"req_id:{request.get('request_id', ''),} tokens: {tokens}, token_ids: {token_ids}")
+            f"req_id:{request.get('request_id', ''), } tokens: {tokens}, token_ids: {token_ids}"
+        )
         return token_ids
@@ -239,9 +239,7 @@ class DataProcessor(BaseDataProcessor):
                 task["enable_thinking"] = kwargs.get("enable_thinking", True)
             request.prompt_token_ids = self.messages2ids(task)
         else:
-            raise ValueError(
-                f"The request should have `input_ids`, `text` or `messages`: {request}."
-            )
+            raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
         if len(request.prompt_token_ids) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
         if request.get("max_tokens") is None:
@@ -281,18 +279,16 @@ class DataProcessor(BaseDataProcessor):
 
         data_processor_logger.info(f"Processing request {request}")
         # processing prompt_token_ids
-        if not request.get('prompt_token_ids'):
-            if 'prompt' in request:
-                request['prompt_token_ids'] = self.text2ids(request['prompt'], max_model_len).tolist()
-            elif 'messages' in request:
+        if not request.get("prompt_token_ids"):
+            if "prompt" in request:
+                request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len).tolist()
+            elif "messages" in request:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
                 request["prompt_token_ids"] = self.messages2ids(request)
             else:
-                raise ValueError(
-                    f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
-                )
-            if len(request['prompt_token_ids']) == 0:
+                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
+            if len(request["prompt_token_ids"]) == 0:
                 raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
@@ -357,8 +353,7 @@ class DataProcessor(BaseDataProcessor):
         if is_end:
             full_text = previous_texts + delta_text
             if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict)
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
             else:
@@ -21,8 +21,8 @@ import paddle.nn.layer
 from paddle.device.cuda import graphs
 
 from fastdeploy.config import FDConfig
-from fastdeploy.utils import get_logger
 from fastdeploy.distributed.communication import capture_custom_allreduce
+from fastdeploy.utils import get_logger
 
 logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
 
@@ -99,7 +99,7 @@ class CudaGraphPiecewiseBackend:
                 entry.runnable(**kwargs)
                 logger.debug(
                     f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, "
-                    f"finished ({n+1}/{entry.num_finished_warmup}) times"
+                    f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
 
             # Store input addresses for debug
@@ -114,7 +114,6 @@ class CudaGraphPiecewiseBackend:
             new_grpah.capture_begin()
             output = entry.runnable(**kwargs)
             new_grpah.capture_end()
 
-
             # Store output buffer
             entry.cuda_graph = new_grpah
@@ -17,7 +17,11 @@ dcu backend methods
 """
 
 from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
-from .weight_only import DCUWeightOnlyLinearMethod
 from .top_p_sampling import native_top_p_sampling
+from .weight_only import DCUWeightOnlyLinearMethod
 
-__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod", "native_top_p_sampling"]
+__all__ = [
+    "DCUTritonWeightOnlyMoEMethod",
+    "DCUWeightOnlyLinearMethod",
+    "native_top_p_sampling",
+]
@@ -13,13 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 
 import paddle
 
 
-def native_top_p_sampling(
-    probs: paddle.Tensor,
-    top_p: paddle.Tensor
-) -> tuple[paddle.Tensor, paddle.Tensor]:
+def native_top_p_sampling(probs: paddle.Tensor, top_p: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
     sorted_indices = paddle.argsort(probs, descending=True)
     sorted_probs = paddle.sort(probs, descending=True)
     cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)
@@ -30,7 +28,9 @@ def native_top_p_sampling(
     sorted_indices = sorted_indices + paddle.arange(probs.shape[0], dtype="int64").unsqueeze(-1) * probs.shape[-1]
 
     condition = paddle.scatter(
-        sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()
+        sorted_indices_to_remove.flatten(),
+        sorted_indices.flatten(),
+        sorted_indices_to_remove.flatten(),
     )
 
     condition = paddle.cast(condition, "bool").reshape(probs.shape)
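For readers skimming the reformatted kernel, a hedged NumPy sketch of the top-p (nucleus) rule it implements: keep the smallest prefix of the descending-sorted distribution whose cumulative probability reaches `top_p`, then renormalise. This is an illustration only, not the Paddle implementation above.

import numpy as np

def top_p_filter(probs: np.ndarray, top_p: float) -> np.ndarray:
    order = np.argsort(probs)[::-1]                  # token ids sorted by probability, descending
    sorted_probs = probs[order]
    cumulative = np.cumsum(sorted_probs)
    cutoff = np.searchsorted(cumulative, top_p) + 1  # smallest prefix covering top_p
    keep = order[:cutoff]
    filtered = np.zeros_like(probs)
    filtered[keep] = probs[keep]
    return filtered / filtered.sum()

print(top_p_filter(np.array([0.5, 0.3, 0.15, 0.05]), 0.8))  # -> [0.625 0.375 0. 0.]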
|
@@ -143,7 +143,13 @@ class DeepEPEngine:
             event: the event after executing the kernel (valid only if `async_finish` is set).
             hook: the receiving hook function (valid only if `return_recv_hook` is set).
         """
-        (packed_recv_x, recv_expert_count, handle, _, dispatch_hook,) = self.deepep_engine.low_latency_dispatch(
+        (
+            packed_recv_x,
+            recv_expert_count,
+            handle,
+            _,
+            dispatch_hook,
+        ) = self.deepep_engine.low_latency_dispatch(
             hidden_states,
             topk_idx,
             expertwise_scale,
@@ -21,15 +21,21 @@ import fastdeploy
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
-    tritonmoe_preprocess_func,
     noaux_tc,
+    tritonmoe_preprocess_func,
 )
 
 from ..quantization.quant_base import QuantMethodBase
 
-def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k,
-                   routed_scaling_factor,
-                   e_score_correction_bias) -> paddle.Tensor:
+
+def get_moe_scores(
+    gating_output: paddle.Tensor,
+    n_group,
+    topk_group,
+    top_k,
+    routed_scaling_factor,
+    e_score_correction_bias,
+) -> paddle.Tensor:
     """
     compute moe scores using e_score_correction_bias.
     """
@@ -45,6 +51,7 @@ def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k,
     )
     return scores
 
+
 def gptq_marlin_moe_repack(
     b_q_weight: paddle.Tensor,
     perm: paddle.Tensor,
@@ -226,10 +233,14 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
         topk_method = layer.topk_method
 
         if topk_method == "noaux_tc":
-            gate_out = get_moe_scores(gate_out, layer.n_group,
-                                      layer.topk_group, layer.top_k,
-                                      layer.routed_scaling_factor,
-                                      layer.gate_correction_bias)
+            gate_out = get_moe_scores(
+                gate_out,
+                layer.n_group,
+                layer.topk_group,
+                layer.top_k,
+                layer.routed_scaling_factor,
+                layer.gate_correction_bias,
+            )
 
             topk_weights, topk_ids = paddle.topk(gate_out, k=layer.top_k, axis=-1, sorted=False)
         else:
@@ -609,11 +609,11 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func
 
         sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
-            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
+            topk_ids, num_local_experts, config["BLOCK_SIZE_M"]
+        )
         # cache13 = create_empty_tensor(tuple([token_num * top_k * max(N1, N2)]), x.dtype)
         cache13 = paddle.empty([token_num * top_k * max(N1, N2)], dtype=x.dtype)
-        intermediate_cache1 = cache13[:token_num * top_k * N1].view(
-            [token_num * top_k, N1])
+        intermediate_cache1 = cache13[: token_num * top_k * N1].view([token_num * top_k, N1])
         max_num_tokens_padded = sorted_token_ids.shape[0]
 
         grid = (
@@ -669,11 +669,11 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
 
         intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
 
-        intermediate_cache3 = cache13[:token_num * top_k * N2].view(
-            [token_num * top_k, N2])
+        intermediate_cache3 = cache13[: token_num * top_k * N2].view([token_num * top_k, N2])
 
-        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
-                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
+        grid = (
+            ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
+        )
 
         x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
             intermediate_cache2, self.quant_config.weight_block_size[0]
|
@@ -125,7 +125,7 @@ class FusedMoE(nn.Layer):
             self.init_moe_weights()
 
         logger.info(
-            f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
+            f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \
             {top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
             , ep_size={self.ep_size}, \
             tp_size={self.tp_size}."
@@ -232,17 +232,21 @@ class FusedMoE(nn.Layer):
                 up_gate_proj_expert_weight_key_name = up_gate_proj_expert_weight_key.format(expert_idx)
                 up_gate_proj_weights.append(
                     get_tensor(
-                        state_dict.pop(up_gate_proj_expert_weight_key_name)
-                        if up_gate_proj_expert_weight_key_name in state_dict
-                        else up_gate_proj_expert_weight_key_name,
+                        (
+                            state_dict.pop(up_gate_proj_expert_weight_key_name)
+                            if up_gate_proj_expert_weight_key_name in state_dict
+                            else up_gate_proj_expert_weight_key_name
+                        ),
                         self.fd_config.parallel_config.model_name_or_path,
                     )
                 )
                 down_proj_weights.append(
                     get_tensor(
-                        state_dict.pop(down_proj_expert_weight_key_name)
-                        if down_proj_expert_weight_key_name in state_dict
-                        else down_proj_expert_weight_key_name,
+                        (
+                            state_dict.pop(down_proj_expert_weight_key_name)
+                            if down_proj_expert_weight_key_name in state_dict
+                            else down_proj_expert_weight_key_name
+                        ),
                         self.fd_config.parallel_config.model_name_or_path,
                     )
                 )
@@ -255,23 +259,29 @@ class FusedMoE(nn.Layer):
                 up_expert_weight_key_name = up_expert_weight_key.format(expert_idx)
                 down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx)
                 gate = get_tensor(
-                    state_dict.pop(gate_expert_weight_key_name)
-                    if gate_expert_weight_key_name in state_dict
-                    else gate_expert_weight_key_name,
+                    (
+                        state_dict.pop(gate_expert_weight_key_name)
+                        if gate_expert_weight_key_name in state_dict
+                        else gate_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
                 up = get_tensor(
-                    state_dict.pop(up_expert_weight_key_name)
-                    if up_expert_weight_key_name in state_dict
-                    else up_expert_weight_key_name,
+                    (
+                        state_dict.pop(up_expert_weight_key_name)
+                        if up_expert_weight_key_name in state_dict
+                        else up_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
                 up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
                 down_proj_weights.append(
                     get_tensor(
-                        state_dict.pop(down_proj_expert_weight_key_name)
-                        if down_proj_expert_weight_key_name in state_dict
-                        else down_proj_expert_weight_key_name,
+                        (
+                            state_dict.pop(down_proj_expert_weight_key_name)
+                            if down_proj_expert_weight_key_name in state_dict
+                            else down_proj_expert_weight_key_name
+                        ),
                         self.fd_config.parallel_config.model_name_or_path,
                     )
                 )
@@ -54,8 +54,8 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     elif current_platform.is_dcu():
-        from fastdeploy.model_executor.ops.gpu import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores
+
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             prompt_ids,
@@ -81,6 +81,7 @@ def top_k_top_p_sampling(
         _, ids = gcu_top_p_sampling(x, top_p)
     elif current_platform.is_dcu():
         from fastdeploy.model_executor.layers.backends import native_top_p_sampling
+
         _, ids = native_top_p_sampling(x, top_p)
     else:
         _, ids = paddle.tensor.top_p_sampling(
@@ -300,7 +300,13 @@ def speculate_remove_padding(
     if current_platform.is_cuda():
         cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
         token_num = paddle.sum(seq_lens_this_time)
-        (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k,) = speculate_get_padding_offset(
+        (
+            ids_remove_padding,
+            cum_offsets,
+            padding_offset,
+            cu_seqlens_q,
+            cu_seqlens_k,
+        ) = speculate_get_padding_offset(
             input_ids,
             draft_tokens,
             cum_offsets_now,
@@ -103,9 +103,9 @@ def extract_triton_kernel(kernel, file_name):
     import textwrap
 
     fn = kernel
-    if type(kernel) == triton.runtime.jit.JITFunction:
+    if isinstance(kernel, triton.runtime.jit.JITFunction):
         fn = kernel.fn
-    elif type(kernel) == triton.runtime.autotuner.Autotuner:
+    elif isinstance(kernel, triton.runtime.autotuner.Autotuner):
         fn = kernel.fn.fn
     else:
         AssertionError("error occurs")
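This hunk and the ones that follow replace `type(x) == T` comparisons with `isinstance(x, T)`. A tiny self-contained sketch of why the latter is usually what is wanted, since it also accepts subclasses:

class Base:
    pass

class Child(Base):
    pass

obj = Child()
print(type(obj) == Base)      # False: exact-type comparison ignores inheritance
print(isinstance(obj, Base))  # True: subclass instances still match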
@@ -195,14 +195,14 @@ def get_value_hint(x):
     """
     hint = ""
     for ele in x:
-        if type(ele) == int:
+        if isinstance(ele, int):
             if ele % 16 == 0 and ele > 0:
                 hint += "i64:16,"
             elif ele == 1:
                 hint += "i64:1,"
             else:
                 hint += "i64,"
-        if type(ele) == float:
+        if isinstance(ele, float):
             hint += "fp32,"
     return hint
 
@@ -467,16 +467,16 @@ def rendering_common_template(
         if arg_defaults[i] is None:
             input_and_attr += f"paddle::optional<paddle::Tensor> & {arg_names[i]},"
             paddle_input_sig += f"""paddle::Optional("{arg_names[i]}"),"""
-        elif type(arg_defaults[i]) == float:
+        elif isinstance(arg_defaults[i], float):
             input_and_attr += f"float {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: float","""
-        elif type(arg_defaults[i]) == bool:
+        elif isinstance(arg_defaults[i], bool):
             input_and_attr += f"bool {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: bool","""
-        elif type(arg_defaults[i]) == int:
+        elif isinstance(arg_defaults[i], int):
             input_and_attr += f"int64_t {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: int64_t","""
-        elif type(arg_defaults[i]) == str:
+        elif isinstance(arg_defaults[i], str):
             input_and_attr += f"std::string {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: std::string","""
         elif arg_names[i] == "config":
@@ -629,11 +629,11 @@ class KernelInterface:
         for i in range(len(all_input)):
             ele = all_input[i]
             if (
-                type(ele) == paddle.Tensor
-                or type(ele) == paddle.base.framework.EagerParamBase
-                or type(ele) == paddle.base.framework.Parameter
-                or type(ele) == paddle.base.framework.Variable
-                or type(ele) == paddle.base.libpaddle.pir.Value
+                isinstance(ele, paddle.Tensor)
+                or isinstance(ele, paddle.base.framework.EagerParamBase)
+                or isinstance(ele, paddle.base.framework.Parameter)
+                or isinstance(ele, paddle.base.framework.Variable)
+                or isinstance(ele, paddle.base.libpaddle.pir.Value)
             ):
                 dtypes.append(ele.dtype)
                 modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]"
@@ -668,7 +668,7 @@ class KernelInterface:
         lanuch_grid = list(self.grid)
         for i in range(len(lanuch_grid)):
             ele = lanuch_grid[i]
-            if type(ele) == str:
+            if isinstance(ele, str):
                 for key in const_hint_dict.keys():
                     if key in ele:
                         ele = ele.replace(key, f"{{{key}}}")
@@ -153,14 +153,14 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel):
         # Helper function to add layer mappings
         def _add_layer_mappings(layer_idx: int):
             # MoE specific mappings
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight"
+            )
 
             if self.fd_config.model_config.moe_use_aux_free:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"
-                ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = (
+                    f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                )
 
             # MoE experts mappings
             for expert_idx in range(self.fd_config.model_config.moe_num_experts):
@@ -184,7 +184,8 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel):
         assert isinstance(self.fd_config.model_config.moe_layer_start_index, int)
         # Process MoE layers
         for layer_idx in range(
-            self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers
+            self.fd_config.model_config.moe_layer_start_index,
+            self.fd_config.model_config.num_hidden_layers,
         ):
             _add_layer_mappings(layer_idx)
 
@@ -226,9 +227,9 @@ class Ernie4_5_VLMoeForConditionalGenerationRL(Ernie4_5_VLMoeForConditionalGener
         def _add_expert_mappings(layer_idx: int, moe_tag: str, expert_start: int):
             # MoE specific mappings
             gate_suffix = "" if moe_tag == "text" else "_1"
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}"
+            )
 
             if self.fd_config.model_config.moe_use_aux_free:
                 self.infer_to_train_mapping[
@@ -245,7 +246,10 @@ class Ernie4_5_VLMoeForConditionalGenerationRL(Ernie4_5_VLMoeForConditionalGener
 
             expert_mappings = defaultdict(list)
             for expert_idx in _generate_ranges(
-                expert_start, total_moe_num, expert_num_per_rank * 2, expert_num_per_rank
+                expert_start,
+                total_moe_num,
+                expert_num_per_rank * 2,
+                expert_num_per_rank,
             ):
                 for ph in place_holders:
                     expert_mappings[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.up_gate_proj_weight"].append(
@@ -323,9 +327,9 @@ class Qwen2ForCausalLMRL(Qwen2ForCausalLM, BaseRLModel):
         def _add_layer_mappings(layer_idx):
             # FFN mappings
             for ph in place_holders:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"
-                ] = f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"] = (
+                    f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+                )
 
         for layer_idx in range(self.fd_config.model_config.num_hidden_layers):
             _add_layer_mappings(layer_idx)
@@ -368,14 +372,14 @@ class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM, BaseRLModel):
         # Helper function to add layer mappings
        def _add_layer_mappings(layer_idx: int):
             # MoE specific mappings
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight"
+            )
 
             if self.fd_config.moe_config.moe_use_aux_free:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"
-                ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = (
+                    f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                )
 
             # MoE experts mappings
             for expert_idx in range(self.fd_config.moe_config.num_experts):
@@ -6,7 +6,7 @@ known_third_party = ["paddle"]
 [tool.black]
 line-length = 119
 target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310']
-exclude = ['.flake8']
+exclude = '.flake8'
 
 
 
@@ -342,10 +342,12 @@ def test_streaming(openai_client, capsys):
             output.append(chunk.choices[0].text)
     assert len(output) > 0
 
+
 # ==========================
 # OpenAI Client additional chat/completions test
 # ==========================
 
+
 def test_non_streaming_with_stop_str(openai_client):
     """
     Test non-streaming chat functionality with the local service
@@ -423,12 +425,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert isinstance(response.choices[0].message.prompt_token_ids, list)
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert isinstance(response.choices[0].message.completion_token_ids, list)
 
     # disable return_token_ids
@@ -440,12 +442,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert response.choices[0].message.prompt_token_ids is None
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert response.choices[0].message.completion_token_ids is None
 
 
@@ -464,11 +466,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
    for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
@@ -487,12 +489,12 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
         assert chunk.choices[0].delta.prompt_token_ids is None
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         assert chunk.choices[0].delta.completion_token_ids is None
 
 
@@ -509,11 +511,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert hasattr(response.choices[0], "prompt_token_ids")
     assert isinstance(response.choices[0].prompt_token_ids, list)
-    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert hasattr(response.choices[0], "completion_token_ids")
     assert isinstance(response.choices[0].completion_token_ids, list)
 
     # disable return_token_ids
@@ -525,11 +527,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
|
|||||||
extra_body={"return_token_ids": False},
|
extra_body={"return_token_ids": False},
|
||||||
stream=False,
|
stream=False,
|
||||||
)
|
)
|
||||||
assert hasattr(response, 'choices')
|
assert hasattr(response, "choices")
|
||||||
assert len(response.choices) > 0
|
assert len(response.choices) > 0
|
||||||
assert hasattr(response.choices[0], 'prompt_token_ids')
|
assert hasattr(response.choices[0], "prompt_token_ids")
|
||||||
assert response.choices[0].prompt_token_ids is None
|
assert response.choices[0].prompt_token_ids is None
|
||||||
assert hasattr(response.choices[0], 'completion_token_ids')
|
assert hasattr(response.choices[0], "completion_token_ids")
|
||||||
assert response.choices[0].completion_token_ids is None
|
assert response.choices[0].completion_token_ids is None
|
||||||
|
|
||||||
|
|
@@ -548,10 +550,10 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'prompt_token_ids')
-        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "prompt_token_ids")
+        assert hasattr(chunk.choices[0], "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].prompt_token_ids, list)
@@ -570,11 +572,11 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "prompt_token_ids")
         assert chunk.choices[0].prompt_token_ids is None
-        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "completion_token_ids")
         assert chunk.choices[0].completion_token_ids is None
@@ -587,13 +589,13 @@ def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys):
         messages=[],
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response, 'usage')
-    assert hasattr(response.usage, 'prompt_tokens')
+    assert hasattr(response, "usage")
+    assert hasattr(response.usage, "prompt_tokens")
     assert response.usage.prompt_tokens == 9
@@ -606,17 +608,17 @@ def test_streaming_chat_with_prompt_token_ids(openai_client, capsys):
         messages=[],
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=True,
         stream_options={"include_usage": True},
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
-        assert hasattr(chunk, 'usage')
+        assert hasattr(chunk, "choices")
+        assert hasattr(chunk, "usage")
         if len(chunk.choices) > 0:
             assert chunk.usage is None
         else:
-            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert hasattr(chunk.usage, "prompt_tokens")
             assert chunk.usage.prompt_tokens == 9
@@ -629,13 +631,13 @@ def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys):
         prompt="",
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response, 'usage')
-    assert hasattr(response.usage, 'prompt_tokens')
+    assert hasattr(response, "usage")
+    assert hasattr(response.usage, "prompt_tokens")
     assert response.usage.prompt_tokens == 9
@@ -648,16 +650,15 @@ def test_streaming_completion_with_prompt_token_ids(openai_client, capsys):
         prompt="",
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=True,
         stream_options={"include_usage": True},
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
-        assert hasattr(chunk, 'usage')
+        assert hasattr(chunk, "choices")
+        assert hasattr(chunk, "usage")
         if len(chunk.choices) > 0:
             assert chunk.usage is None
         else:
-            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert hasattr(chunk.usage, "prompt_tokens")
             assert chunk.usage.prompt_tokens == 9
-
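The hunks above only adjust formatting, but they document how the prompt_token_ids path is driven from the OpenAI-compatible client. A minimal standalone sketch of that flow follows; it is not part of this commit, and the base_url, api_key, and model name are placeholders rather than values taken from the repository.

# Hedged sketch of the streaming-completion usage exercised by the tests above.
# base_url, api_key and model are assumptions; prompt_token_ids and the expected
# prompt_tokens == 9 come from the test code in this diff.
import openai

client = openai.OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

response = client.completions.create(
    model="default",
    prompt="",  # the tests pass an empty prompt and supply token ids via extra_body
    temperature=1,
    max_tokens=5,
    extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in response:
    if chunk.choices:
        # content chunks: usage is expected to be None here
        print(chunk.choices[0].text, end="", flush=True)
    else:
        # final usage-only chunk: the tests expect prompt_tokens == 9
        print("\nprompt_tokens:", chunk.usage.prompt_tokens)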
@@ -325,11 +325,11 @@ def test_streaming_chat(openai_client, capsys):
     assert len(output) > 2


 # ==========================
 # OpenAI Client additional chat/completions test
 # ==========================


 def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
     """
     Test return_token_ids option in non-streaming chat functionality with the local service
@@ -340,35 +340,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             }, # system不是必需,可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert isinstance(response.choices[0].message.prompt_token_ids, list)
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert isinstance(response.choices[0].message.completion_token_ids, list)

     # 不设定 return_token_ids
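As a reading aid for the assertions above, here is a hedged client-side sketch of the return_token_ids switch; it is not part of this commit, the endpoint, api_key, and model name are assumptions, and the multimodal message from the test is reduced to plain text for brevity.

# Hedged sketch only: connection details and model name are assumptions.
import openai

client = openai.OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "请描述图片内容"}],  # image part omitted for brevity
    temperature=1,
    max_tokens=53,
    extra_body={"return_token_ids": True},
    stream=False,
)

message = response.choices[0].message
# When return_token_ids is enabled the server adds these extra fields to the message,
# which is what the assertions in the hunk above check.
print(message.prompt_token_ids)      # list of input token ids
print(message.completion_token_ids)  # list of generated token ids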
@@ -377,35 +375,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             }, # system不是必需,可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert response.choices[0].message.prompt_token_ids is None
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert response.choices[0].message.completion_token_ids is None
@@ -419,23 +415,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             }, # system不是必需,可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
@@ -444,11 +438,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
@@ -463,23 +457,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             }, # system不是必需,可选
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
@@ -487,10 +479,10 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "delta")
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
         assert chunk.choices[0].delta.prompt_token_ids is None
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         assert chunk.choices[0].delta.completion_token_ids is None
@@ -294,4 +294,6 @@ def test_non_thinking_prompt(api_url, headers):
         assert False, f"Response is not valid JSON: {e}"

     content = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").lower()
-    assert not any(x in content for x in ["根据", "我认为", "推测", "可能"]), "Expected no reasoning in non-thinking response"
+    assert not any(
+        x in content for x in ["根据", "我认为", "推测", "可能"]
+    ), "Expected no reasoning in non-thinking response"