diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 05bd57183..c65f8c38d 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -115,6 +115,7 @@ class OpenAIServingChat:
         except Exception as e:
             error_msg = f"request[{request_id}] generator error: {str(e)}, {str(traceback.format_exc())}"
             api_server_logger.error(error_msg)
+            self.engine_client.semaphore.release()
             return ErrorResponse(code=400, message=error_msg)
 
         del current_req_dict
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 704330373..75f890c44 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -128,22 +128,22 @@ class OpenAIServingCompletion:
             return ErrorResponse(code=408, message=error_msg)
 
         try:
-            for idx, prompt in enumerate(request_prompts):
-                request_id_idx = f"{request_id}-{idx}"
-                current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
-                try:
+            try:
+                for idx, prompt in enumerate(request_prompts):
+                    request_id_idx = f"{request_id}-{idx}"
+                    current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
                     current_req_dict["arrival_time"] = time.time()
                     prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
                     if isinstance(prompt_token_ids, np.ndarray):
                         prompt_token_ids = prompt_token_ids.tolist()
                     text_after_process_list.append(current_req_dict.get("text_after_process"))
                     prompt_batched_token_ids.append(prompt_token_ids)
-                except Exception as e:
-                    error_msg = f"OpenAIServingCompletion format error: {e}, {str(traceback.format_exc())}"
-                    api_server_logger.error(error_msg)
-                    return ErrorResponse(message=str(e), code=400)
-
-            del current_req_dict
+                del current_req_dict
+            except Exception as e:
+                error_msg = f"OpenAIServingCompletion format error: {e}, {str(traceback.format_exc())}"
+                api_server_logger.error(error_msg)
+                self.engine_client.semaphore.release()
+                return ErrorResponse(message=str(e), code=400)
 
         if request.stream:
             return self.completion_stream_generator(