Adapt to vLLM responses without arrival_time; adapt to vLLM requiring the model field; add test-case number field no to RequestFuncInput/RequestFuncOutput/SampleRequest

lijingning
2025-07-15 16:53:03 +08:00
committed by Zhang Yulong
parent 1b712bba82
commit 9d6a42b334
3 changed files with 215 additions and 17 deletions
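
Taken together, the three changes amount to the pattern below. This is a minimal illustrative sketch, not code from the commit: the dataclasses are simplified stand-ins for the ones in the diff (field names mirror the diff; the model name and case number are made up).

import time
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class RequestFuncInput:                  # simplified stand-in
    no: int                              # new: per-case number, threaded through to the output
    model: str                           # now sent verbatim instead of the hard-coded "default"
    prompt: str = ""
    history_QA: Optional[dict] = None


@dataclass
class RequestFuncOutput:                 # simplified stand-in
    no: int = 0
    arrival_time: list = field(default_factory=list)


def record_chunk(output: RequestFuncOutput, choice: dict) -> None:
    # vLLM emits no per-chunk "arrival_time", so fall back to the local
    # receive timestamp instead of appending None.
    timestamp = time.perf_counter()
    output.arrival_time.append(choice.get("arrival_time", timestamp))


inp = RequestFuncInput(no=7, model="my-model")    # hypothetical case number / model name
payload = {"model": inp.model}                    # "model" must now be sent explicitly
out = RequestFuncOutput(no=inp.no)                # case number propagated to the output
record_chunk(out, {})                             # vLLM-style chunk without arrival_time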


@@ -36,6 +36,7 @@ AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 @dataclass
 class RequestFuncInput:
     """Input for requesting LLMs via API"""
+    no: int
     prompt: str
     history_QA: Optional[dict]
     hyper_parameters: dict
@@ -54,6 +55,7 @@ class RequestFuncInput:
 @dataclass
 class RequestFuncOutput:
     """Output for requesting LLMs via API"""
+    no: int = 0
     generated_text: str = ""
     reasoning_content: str = ""
     success: bool = False
@@ -84,7 +86,7 @@ async def async_request_eb_openai_chat_completions(
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
         payload = {
-            "model": "default",
+            "model": request_func_input.model,
            "messages": request_func_input.history_QA,
             "stream": True,
             "stream_options": {
@@ -97,6 +99,9 @@ async def async_request_eb_openai_chat_completions(
         if request_func_input.ignore_eos:
             payload["ignore_eos"] = request_func_input.ignore_eos
+        print("payload:{}".format(json.dumps(payload, ensure_ascii=False)))
         headers = {
             "Content-Type": "application/json",
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
@@ -104,6 +109,7 @@ async def async_request_eb_openai_chat_completions(
         output = RequestFuncOutput()
         output.prompt_len = 0
+        output.no = request_func_input.no
         ttft = 0.0
         st = time.perf_counter()
@@ -132,7 +138,8 @@ async def async_request_eb_openai_chat_completions(
                                     ttft = timestamp - st
                                     output.ttft = ttft
                                     # cached_tokens
-                                    output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
+                                    output.prompt_len = data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
                                 # Decoding phase
                                 else:
@@ -141,12 +148,12 @@ async def async_request_eb_openai_chat_completions(
                                 output.generated_text += content or ""
                                 output.reasoning_content += reason_content or ""
-                                output.arrival_time.append(choices[0].get("arrival_time"))
-                            elif usage := data.get("usage"):
+                                output.arrival_time.append(choices[0].get("arrival_time", timestamp))
+                            elif usage := data.get("usage", {}):
                                 output.output_tokens = usage.get(
-                                    "completion_tokens")
+                                    "completion_tokens", 0)
                                 output.prompt_tokens = usage.get(
-                                    "prompt_tokens")
+                                    "prompt_tokens", 0)
                             most_recent_timestamp = timestamp
@@ -173,6 +180,7 @@ async def async_request_eb_openai_chat_completions(
                 f.write(str(output) + "\n")
         if pbar:
             pbar.update(1)
+        print("#####final_output:", output)
         return output
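
The cached_tokens read in the hunks above also switches from direct indexing to chained .get() calls, because a vLLM usage block may omit prompt_tokens_details entirely. A self-contained sketch of the pattern (both payloads are hypothetical):

import json

# One payload with prompt_tokens_details present, one vLLM-style without it.
with_details = json.loads('{"usage": {"prompt_tokens_details": {"cached_tokens": 42}}}')
without_details = json.loads('{"usage": {"prompt_tokens": 128}}')

for data in (with_details, without_details):
    # Chained .get() with defaults never raises, unlike the old
    # data["usage"]["prompt_tokens_details"]["cached_tokens"].
    cached = data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
    print(cached)  # prints 42, then 0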
@@ -189,7 +197,7 @@ async def async_request_eb_openai_completions(
     async with aiohttp.ClientSession(trust_env=True,
                                      timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
-            "model": "default",
+            "model": request_func_input.model,
             "prompt": request_func_input.prompt,
             "stream": True,
             "stream_options": {
@@ -202,14 +210,20 @@ async def async_request_eb_openai_completions(
         if request_func_input.ignore_eos:
             payload["ignore_eos"] = request_func_input.ignore_eos
+        print("payload:", json.dumps(payload, ensure_ascii=False))
         headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "Content-Type": "application/json"
         }
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
+        output.no = request_func_input.no
         generated_text = ""
         ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
@@ -226,6 +240,7 @@ async def async_request_eb_openai_completions(
                             "data: ")
                         if chunk != "[DONE]":
                             # print("####chunk:", chunk, chunk.usage)
+                            timestamp = time.perf_counter()
                             data = json.loads(chunk)
                             # NOTE: Some completion API might have a last
@@ -235,21 +250,22 @@ async def async_request_eb_openai_completions(
                                 # Note that text could be empty here
                                 # e.g. for special tokens
                                 text = choices[0].get("text")
-                                timestamp = time.perf_counter()
                                 # First token
                                 if not first_chunk_received:
                                     first_chunk_received = True
-                                    ttft = time.perf_counter() - st
+                                    ttft = timestamp - st
                                     output.ttft = ttft
                                 # Decoding phase
                                 else:
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
-                                generated_text += text or ""
                                 most_recent_timestamp = timestamp
-                                output.arrival_time.append(choices[0].get("arrival_time"))
+                                generated_text += text or ""
+                                output.arrival_time.append(choices[0].get("arrival_time", timestamp))
                             elif usage := data.get("usage"):
                                 output.prompt_tokens = usage.get(
                                     "prompt_tokens")
@@ -262,8 +278,15 @@ async def async_request_eb_openai_completions(
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
+
+                    if output.generated_text == "":
+                        output.success = False
+                        output.error = "No generated text found!"
+                    else:
+                        output.success = True
                 else:
                     output.error = response.reason or ""
                     output.success = False
@@ -271,6 +294,8 @@ async def async_request_eb_openai_completions(
             output.success = False
             exc_info = sys.exc_info()
             output.error = "".join(traceback.format_exception(*exc_info))
+
+        print("final_output:{}".format(output))
         if pbar:
             pbar.update(1)
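
The end-of-stream check added above means an HTTP 200 with an empty stream is now reported as a failure. A minimal sketch of that rule, with a simplified stand-in for RequestFuncOutput:

from dataclasses import dataclass


@dataclass
class Result:                        # simplified stand-in for RequestFuncOutput
    generated_text: str = ""
    success: bool = False
    error: str = ""


def validate_stream_end(result: Result, generated_text: str) -> Result:
    # Mirrors the check added above: an empty stream is a failed request,
    # even when the HTTP status was 200.
    result.generated_text = generated_text
    if result.generated_text == "":
        result.success = False
        result.error = "No generated text found!"
    else:
        result.success = True
    return result


print(validate_stream_end(Result(), "").success)       # False
print(validate_stream_end(Result(), "Hello").success)  # True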