Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
[Feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post processing (#3536)
* [feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post-processing
* Infer engine supports temp_scaled_logprobs and top_p_normalized_logprobs
* Code check
* Code check
* Fix tokenizer.decoder(-1) to return 'Invalid Token'
* Check seq len time shape
* Clip inf values in logprobs

Co-authored-by: sunlei1024 <sunlei5788@gmail.com>
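For context, a minimal sketch of what the two flags are intended to compute, under assumed semantics (an illustration, not FastDeploy's implementation): temp_scaled_logprobs reports log-probabilities of the temperature-scaled distribution rather than of the raw logits, and top_p_normalized_logprobs renormalizes probability mass over the top-p nucleus before taking logs.

import numpy as np

def postprocess_logprobs(logits, temperature, top_p,
                         temp_scaled=False, top_p_normalized=False):
    # Hypothetical sketch of the two post-processing modes added by this commit.
    if temp_scaled:
        # Report logprobs of the distribution actually sampled from.
        logits = logits / max(temperature, 1e-6)
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    if top_p_normalized:
        order = np.argsort(-probs)
        cum = np.cumsum(probs[order])
        # Keep the smallest prefix of tokens whose mass reaches top_p
        # (the highest-probability token is always kept).
        cutoff = int(np.searchsorted(cum, top_p)) + 1
        keep = np.zeros_like(probs, dtype=bool)
        keep[order[:max(1, cutoff)]] = True
        probs = np.where(keep, probs, 0.0)
        probs /= probs.sum()
    # Clip before the log to avoid -inf, mirroring the "logprob clip inf" fix above.
    return np.log(np.clip(probs, 1e-10, None))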
@@ -267,6 +267,10 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
             self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
             self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0)
+            self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = request.get("temp_scaled_logprobs", False)
+            self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = request.get(
+                "top_p_normalized_logprobs", False
+            )

             self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1)
             self.share_inputs["max_dec_len"][idx : idx + 1] = request.get(
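This first insertion path reads each flag straight from the request dict, defaulting to False when a request does not set it. A hypothetical request payload (only the field names shown in the surrounding diff are real; the values are illustrative):

# Hypothetical request payload; the runner copies each field into row `idx`
# of the matching [max_num_seqs, 1] share_inputs buffer.
request = {
    "repetition_penalty": 1.05,
    "frequency_penalty": 0.0,
    "presence_penalty": 0.0,
    "temp_scaled_logprobs": True,        # new in this commit; defaults to False
    "top_p_normalized_logprobs": True,   # new in this commit; defaults to False
    "min_tokens": 1,
}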
@@ -431,6 +435,12 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["presence_score"][idx : idx + 1] = get_attr_from_request(
                 request, "presence_penalty", 0.0
             )
+            self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = get_attr_from_request(
+                request, "temp_scaled_logprobs", False
+            )
+            self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = get_attr_from_request(
+                request, "top_p_normalized_logprobs", False
+            )

             self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1)
             self.share_inputs["max_dec_len"][idx : idx + 1] = request.get(
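This second path reads the same fields via get_attr_from_request. Its real definition lives elsewhere in FastDeploy; the following is a hypothetical stand-in that only mirrors the value-or-default contract implied by this hunk:

def get_attr_from_request(request, attr, default_value=None):
    # Hypothetical stand-in, not FastDeploy's helper: try an attribute on the
    # request object, fall back to a dict-style lookup, then to the default.
    value = getattr(request, attr, None)
    if value is None and isinstance(request, dict):
        value = request.get(attr, None)
    return value if value is not None else default_value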
@@ -543,6 +553,8 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["presence_score"] = paddle.full(
             [max_num_seqs, 1], self.model_config.presence_score, dtype="float32"
         )
+        self.share_inputs["temp_scaled_logprobs"] = paddle.full([max_num_seqs, 1], False, dtype="bool")
+        self.share_inputs["top_p_normalized_logprobs"] = paddle.full([max_num_seqs, 1], False, dtype="bool")

         self.share_inputs["min_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64")
         self.share_inputs["max_dec_len"] = paddle.full(
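Both buffers are preallocated with one boolean per sequence slot and default to off, so logprob post-processing only applies to requests that opt in. A small check of the allocation pattern (max_num_seqs is an illustrative value):

import paddle

max_num_seqs = 8  # illustrative value
temp_scaled = paddle.full([max_num_seqs, 1], False, dtype="bool")
top_p_norm = paddle.full([max_num_seqs, 1], False, dtype="bool")
print(temp_scaled.shape)            # [8, 1]
print(temp_scaled[0, 0].item())     # False until a request enables it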
@@ -748,6 +760,9 @@ class GPUModelRunner(ModelRunnerBase):
             bad_words_token_ids=self.share_inputs["bad_tokens"],
             eos_token_ids=self.share_inputs["eos_token_id"],
             max_num_logprobs=20 if self.enable_logprob else None,
+            temp_scaled_logprobs=self.share_inputs["temp_scaled_logprobs"],
+            top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"],
+            share_inputs=self.share_inputs,
         )

     def load_model(self) -> None:
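Downstream, the sampler receives the two tensors alongside max_num_logprobs. A hedged sketch of how a consumer could gate temperature scaling per row (only the two tensor names come from this commit; everything else is an assumption, not the actual sampler code):

import paddle
import paddle.nn.functional as F

def scaled_logprobs(logits, temperature, temp_scaled_logprobs):
    # Hypothetical consumer: temp_scaled_logprobs is the [max_num_seqs, 1] bool
    # tensor passed above; rows with the flag set get temperature-scaled logprobs.
    scaled = paddle.where(temp_scaled_logprobs, logits / temperature, logits)
    return F.log_softmax(scaled, axis=-1)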