[Bug fix] fix pooling models (#5358)

* fix
* fix
* fix test
* fix gpu_model_runner

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-12-24 13:28:13 +08:00 · 2025-12-04 11:06:30 +08:00
parent a52aea073c
commit 946025480e
5 changed files with 26 additions and 47 deletions
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -922,6 +922,16 @@ class EmbeddingChatRequest(BaseModel):
    user: Optional[str] = None
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None

+    # --8<-- [start:chat-embedding-extra-params]
+    add_generation_prompt: bool = Field(
+        default=False,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+
    add_special_tokens: bool = Field(
        default=False,
        description=(