diff --git a/docs/features/sampling.md b/docs/features/sampling.md index 01edb5dd8..3a0d22869 100644 --- a/docs/features/sampling.md +++ b/docs/features/sampling.md @@ -98,7 +98,7 @@ curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \ {"role": "user", "content": "How old are you"} ], "top_p": 0.8, - "top_k": 50 + "top_k": 20 }' ``` @@ -117,7 +117,7 @@ response = client.chat.completions.create( ], stream=True, top_p=0.8, - top_k=50 + extra_body={"top_k": 20, "min_p": 0.1} ) for chunk in response: if chunk.choices[0].delta: @@ -159,8 +159,7 @@ response = client.chat.completions.create( ], stream=True, top_p=0.8, - top_k=20, - min_p=0.1 + extra_body={"top_k": 20, "min_p": 0.1} ) for chunk in response: if chunk.choices[0].delta: diff --git a/docs/offline_inference.md b/docs/offline_inference.md index 2da2286b8..31f79b749 100644 --- a/docs/offline_inference.md +++ b/docs/offline_inference.md @@ -183,6 +183,7 @@ For ```LLM``` configuration, refer to [Parameter Documentation](parameters.md). * min_p(float): Minimum probability relative to the maximum probability for a token to be considered (>0 filters low-probability tokens to improve quality) * max_tokens(int): Maximum generated tokens (input + output) * min_tokens(int): Minimum forced generation length +* bad_words(list[str]): Prohibited words ### 2.5 fastdeploy.engine.request.RequestOutput diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md index 761e79720..5cb5bf188 100644 --- a/docs/online_serving/README.md +++ b/docs/online_serving/README.md @@ -137,6 +137,9 @@ When sending requests using openai.Client, these parameters need to be placed in The following sampling parameters are supported. ```python +bad_words: Optional[List[str]] = None +# List of forbidden words that the model should avoid generating (default None means no restriction). 
+ top_k: Optional[int] = None # Limits the consideration to the top K tokens with the highest probability at each generation step, used to control randomness (default None means no limit). diff --git a/docs/zh/features/sampling.md b/docs/zh/features/sampling.md index 829006d31..24cc003b5 100644 --- a/docs/zh/features/sampling.md +++ b/docs/zh/features/sampling.md @@ -98,7 +98,7 @@ curl -X POST "http://0.0.0.0:9222/v1/chat/completions" \ {"role": "user", "content": "How old are you"} ], "top_p": 0.8, - "top_k": 50 + "top_k": 20 }' ``` @@ -118,7 +118,7 @@ response = client.chat.completions.create( ], stream=True, top_p=0.8, - extra_body={"top_k": 50} + extra_body={"top_k": 20} ) for chunk in response: if chunk.choices[0].delta: @@ -161,8 +161,7 @@ response = client.chat.completions.create( ], stream=True, top_p=0.8, - extra_body={"top_k": 20}, - min_p=0.1 + extra_body={"top_k": 20, "min_p": 0.1} ) for chunk in response: if chunk.choices[0].delta: diff --git a/docs/zh/offline_inference.md b/docs/zh/offline_inference.md index 855116484..a77311495 100644 --- a/docs/zh/offline_inference.md +++ b/docs/zh/offline_inference.md @@ -183,6 +183,7 @@ for output in outputs: * min_p(float): token入选的最小概率阈值(相对于最高概率token的比值,设为>0可通过过滤低概率token来提升文本生成质量) * max_tokens(int): 限制模型生成的最大token数量(包括输入和输出) * min_tokens(int): 强制模型生成的最少token数量,避免过早结束 +* bad_words(list[str]): 禁止生成的词列表, 防止模型生成不希望出现的词 ### 2.5 fastdeploy.engine.request.RequestOutput diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md index a68eedbdb..d056418d5 100644 --- a/docs/zh/online_serving/README.md +++ b/docs/zh/online_serving/README.md @@ -137,6 +137,9 @@ metadata: Optional[dict] = None 额外采样参数的支持如下: ```python +bad_words: Optional[List[str]] = None +# 禁止生成的词汇列表,模型会避免输出这些词(默认 None 表示不限制)。 + top_k: Optional[int] = None # 限制每一步生成时只考虑概率最高的 K 个 token,用于控制随机性(默认 None 表示不限制)。