Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 00:57:33 +08:00
[BugFix] fix control signal release failed (#3374)
* [BugFix]
* [BugFix]
* [BugFix]
* [BugFix]
* fix
* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
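The hunks below touch the OpenAI-compatible API server, the chat/completion serving handlers, and the ZMQ result publisher. As a reading aid, here is a minimal sketch of the connection-limiting pattern the first hunks converge on, using a plain `asyncio.Semaphore` and hypothetical names (`MAX_CONCURRENCY`, the bare endpoint) in place of FastDeploy's own stateful semaphore and argument handling — a sketch, not the project's implementation:

```python
import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException

# Hypothetical stand-ins for FastDeploy's semaphore helper and CLI arguments.
MAX_CONCURRENCY = 8
connection_semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
app = FastAPI()


@asynccontextmanager
async def connection_manager():
    # Reject immediately when no slot is free instead of queueing the connection.
    try:
        await asyncio.wait_for(connection_semaphore.acquire(), timeout=0.001)
        yield
    except asyncio.TimeoutError:
        raise HTTPException(
            status_code=429, detail=f"Too many requests, current max concurrency is {MAX_CONCURRENCY}"
        )


@app.post("/v1/chat/completions")
async def create_chat_completion():
    async with connection_manager():
        # Non-streaming exits must release the slot themselves; streaming responses
        # hand the release off to a wrapped generator (see the later hunks).
        connection_semaphore.release()
        return {"object": "chat.completion"}
```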
```diff
@@ -165,9 +165,9 @@ async def connection_manager():
         yield
     except asyncio.TimeoutError:
         api_server_logger.info(f"Reach max request release: {connection_semaphore.status()}")
-        if connection_semaphore.locked():
-            connection_semaphore.release()
-        raise HTTPException(status_code=429, detail="Too many requests")
+        raise HTTPException(
+            status_code=429, detail=f"Too many requests, current max concurrency is {args.max_concurrency}"
+        )
 
 
 def wrap_streaming_generator(original_generator: AsyncGenerator):
```
```diff
@@ -180,7 +180,7 @@ def wrap_streaming_generator(original_generator: AsyncGenerator):
             async for chunk in original_generator:
                 yield chunk
         finally:
-            api_server_logger.debug(f"release: {connection_semaphore.status()}")
+            api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
             connection_semaphore.release()
 
     return wrapped_generator
```
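A self-contained toy (hypothetical names, not the server code) showing why the streaming path defers the release to the wrapper's `finally`: the generator outlives the endpoint call, so the slot must stay held until the last chunk is produced or the stream is aborted:

```python
import asyncio

semaphore = asyncio.Semaphore(1)


async def token_source():
    for tok in ("Hello", ",", " world"):
        await asyncio.sleep(0)
        yield tok


def wrap_streaming_generator(original_generator):
    async def wrapped_generator():
        try:
            async for chunk in original_generator:
                yield chunk
        finally:
            # Runs on normal completion, cancellation, or client disconnect.
            semaphore.release()

    return wrapped_generator


async def main():
    await semaphore.acquire()
    async for chunk in wrap_streaming_generator(token_source())():
        print(chunk, end="")
    print("\nslot free again:", not semaphore.locked())


asyncio.run(main())
```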
```diff
@@ -255,9 +255,11 @@ async def create_chat_completion(request: ChatCompletionRequest):
             generator = await app.state.chat_handler.create_chat_completion(request)
             if isinstance(generator, ErrorResponse):
                 connection_semaphore.release()
+                api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
                 return JSONResponse(content={"detail": generator.model_dump()}, status_code=generator.code)
             elif isinstance(generator, ChatCompletionResponse):
                 connection_semaphore.release()
+                api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}")
                 return JSONResponse(content=generator.model_dump())
             else:
                 wrapped_generator = wrap_streaming_generator(generator)
```
```diff
@@ -78,45 +78,45 @@ class OpenAIServingChat:
             api_server_logger.error(err_msg)
             return ErrorResponse(message=err_msg, code=400)
 
-        if request.user is not None:
-            request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
-        else:
-            request_id = f"chatcmpl-{uuid.uuid4()}"
-        api_server_logger.info(f"create chat completion request: {request_id}")
-        text_after_process = None
-        try:
-            current_req_dict = request.to_dict_for_infer(request_id)
-            current_req_dict["arrival_time"] = time.time()
-            prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
-            text_after_process = current_req_dict.get("text_after_process")
-            if isinstance(prompt_token_ids, np.ndarray):
-                prompt_token_ids = prompt_token_ids.tolist()
-        except Exception as e:
-            return ErrorResponse(code=400, message=str(e))
-
-        del current_req_dict
-
         try:
+            api_server_logger.debug(f"{self.engine_client.semaphore.status()}")
             if self.max_waiting_time < 0:
                 await self.engine_client.semaphore.acquire()
             else:
                 await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
-        except Exception:
-            return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
-        api_server_logger.debug(f"current waiting request {self.engine_client.semaphore.status()}")
 
-        if request.stream:
-            return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process
-            )
-        else:
+            if request.user is not None:
+                request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}"
+            else:
+                request_id = f"chatcmpl-{uuid.uuid4()}"
+            api_server_logger.info(f"create chat completion request: {request_id}")
+            text_after_process = None
             try:
-                return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process
-                )
+                current_req_dict = request.to_dict_for_infer(request_id)
+                current_req_dict["arrival_time"] = time.time()
+                prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict)
+                text_after_process = current_req_dict.get("text_after_process")
+                if isinstance(prompt_token_ids, np.ndarray):
+                    prompt_token_ids = prompt_token_ids.tolist()
             except Exception as e:
                 return ErrorResponse(code=400, message=str(e))
 
+            del current_req_dict
+
+            if request.stream:
+                return self.chat_completion_stream_generator(
+                    request, request_id, request.model, prompt_token_ids, text_after_process
+                )
+            else:
+                try:
+                    return await self.chat_completion_full_generator(
+                        request, request_id, request.model, prompt_token_ids, text_after_process
+                    )
+                except Exception as e:
+                    return ErrorResponse(code=400, message=str(e))
+        except Exception:
+            return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
+
     def _create_streaming_error_response(self, message: str) -> str:
         error_response = ErrorResponse(
             code=400,
```
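A minimal sketch (hypothetical helper, not `OpenAIServingChat` itself) of the queuing policy the handler now applies before any request state is created: wait for an engine slot up to `max_waiting_time` seconds, or map the timeout to the 408-style error seen above:

```python
import asyncio


async def acquire_slot(semaphore: asyncio.Semaphore, max_waiting_time: float) -> None:
    # max_waiting_time < 0 means "wait forever", mirroring the branch in the diff.
    if max_waiting_time < 0:
        await semaphore.acquire()
    else:
        await asyncio.wait_for(semaphore.acquire(), timeout=max_waiting_time)


async def main():
    semaphore = asyncio.Semaphore(0)  # no free slots, to force the timeout path
    try:
        await acquire_slot(semaphore, max_waiting_time=0.05)
    except Exception:
        print("Request queued time exceed 0.05")  # would become ErrorResponse(code=408, ...)


asyncio.run(main())
```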
```diff
@@ -101,6 +101,13 @@ class OpenAIServingCompletion:
         api_server_logger.info(f"start inference for request {num_choices}")
         prompt_batched_token_ids = []
         text_after_process_list = []
+        try:
+            if self.max_waiting_time < 0:
+                await self.engine_client.semaphore.acquire()
+            else:
+                await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
+        except Exception:
+            return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
         try:
             for idx, prompt in enumerate(request_prompts):
                 request_id_idx = f"{request_id}-{idx}"
```
```diff
@@ -117,14 +124,6 @@ class OpenAIServingCompletion:
 
             del current_req_dict
 
-            try:
-                if self.max_waiting_time < 0:
-                    await self.engine_client.semaphore.acquire()
-                else:
-                    await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
-            except Exception:
-                return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}")
-
             if request.stream:
                 return self.completion_stream_generator(
                     request=request,
```
```diff
@@ -67,6 +67,7 @@ class ZmqClient:
         """
         self.router = self.context.socket(zmq.ROUTER)
         self.router.setsockopt(zmq.SNDHWM, self.ZMQ_SNDHWM)
         self.router.setsockopt(zmq.ROUTER_MANDATORY, 1)
+        self.router.setsockopt(zmq.SNDTIMEO, -1)
         self.router.bind(f"ipc://{self.router_path}")
 
```
```diff
@@ -111,7 +112,6 @@ class ZmqClient:
         """
         if self.router is None:
             raise RuntimeError("Router socket not created. Call create_router() first.")
-
         while self.running:
             with self.mutex:
                 if req_id not in self.req_dict:
```
```diff
@@ -124,7 +124,11 @@ class ZmqClient:
                         continue
                 else:
                     break
-
+        if self.req_dict[req_id] == -1:
+            if data[-1].finished:
+                with self.mutex:
+                    self.req_dict.pop(req_id, None)
+            return
         try:
             start_send = time.time()
             if self.aggregate_send:
```
```diff
@@ -133,7 +137,9 @@ class ZmqClient:
                 result = msgpack.packb([response.to_dict() for response in data])
             self.router.send_multipart([self.req_dict[req_id], b"", result])
             llm_logger.debug(f"send_multipart result: {req_id} len {len(data)} elapse: {time.time()-start_send}")
-
+        except zmq.ZMQError as e:
+            llm_logger.error(f"[{req_id}] zmq error: {e}")
+            self.req_dict[req_id] = -1
         except Exception as e:
             llm_logger.error(f"Send result to zmq client failed: {e}")
 
```
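For the ZmqClient changes, a minimal pyzmq sketch (assumed IPC path and dictionary, not the FastDeploy wiring) of the failure mode the new `except zmq.ZMQError` branch reacts to: with `ROUTER_MANDATORY` set, a ROUTER socket raises instead of silently dropping frames addressed to a client identity that is no longer connected, so the sender can mark that request as dead and clean it up later:

```python
import zmq

ctx = zmq.Context()
router = ctx.socket(zmq.ROUTER)
router.setsockopt(zmq.ROUTER_MANDATORY, 1)
router.bind("ipc:///tmp/example_router.ipc")

req_dict = {"req-0": b"dead-client"}  # identity recorded for a client that has gone away

try:
    # No connected peer has the identity b"dead-client", so routing fails loudly.
    router.send_multipart([req_dict["req-0"], b"", b"payload"])
except zmq.ZMQError as e:
    print(f"[req-0] zmq error: {e}")
    req_dict["req-0"] = -1  # later sends can skip this entry and eventually pop it
finally:
    router.close()
    ctx.term()
```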