mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feature] Support for request-level speculative decoding metrics monitoring. (#5518)
* Support per-request speculative decoding metrics monitoring
* Fix bug
* Remove debug log
* Fix unit test bugs
This commit is contained in:
@@ -46,6 +46,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
|
||||
"finished": True,
|
||||
},
|
||||
"output_token_ids": 3,
|
||||
"metrics": {},
|
||||
}
|
||||
self.mock_engine.generate.return_value = [mock_output]
|
||||
|
||||
@@ -80,6 +81,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
|
||||
"finished": True,
|
||||
},
|
||||
"output_token_ids": 3,
|
||||
"metrics": {},
|
||||
}
|
||||
self.mock_engine.generate.return_value = [mock_output]
|
||||
|
||||
@@ -109,10 +111,12 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
|
||||
{
|
||||
"outputs": {"text": " response1", "token_ids": [1, 2], "top_logprobs": None, "finished": True},
|
||||
"output_token_ids": 2,
|
||||
"metrics": {},
|
||||
},
|
||||
{
|
||||
"outputs": {"text": " response2", "token_ids": [3, 4], "top_logprobs": None, "finished": True},
|
||||
"output_token_ids": 2,
|
||||
"metrics": {},
|
||||
},
|
||||
]
|
||||
self.mock_engine.generate.return_value = mock_outputs
|
||||
@@ -146,10 +150,12 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
|
||||
{
|
||||
"outputs": {"text": " response1", "token_ids": [1, 2], "top_logprobs": None, "finished": True},
|
||||
"output_token_ids": 2,
|
||||
"metrics": {},
|
||||
},
|
||||
{
|
||||
"outputs": {"text": " response2", "token_ids": [3, 4], "top_logprobs": None, "finished": True},
|
||||
"output_token_ids": 2,
|
||||
"metrics": {},
|
||||
},
|
||||
]
|
||||
self.mock_engine.generate.return_value = mock_outputs
|
||||
|
||||
@@ -321,6 +321,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
|
||||
],
|
||||
},
|
||||
"finished": True,
|
||||
"metrics": {},
|
||||
},
|
||||
{
|
||||
"request_id": "test_request_id_1",
|
||||
@@ -334,6 +335,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
|
||||
],
|
||||
},
|
||||
"finished": True,
|
||||
"metrics": {},
|
||||
},
|
||||
]
|
||||
|
||||
@@ -493,6 +495,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
|
||||
prompt_logprobs_res_list=prompt_logprobs_res_list,
|
||||
response_processor=mock_response_processor,
|
||||
max_tokens=max_tokens_list[idx],
|
||||
speculate_metrics=None,
|
||||
)
|
||||
|
||||
expected = case["expected"]
|
||||
|
||||
@@ -129,6 +129,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
|
||||
"reasoning_token_num": 10,
|
||||
},
|
||||
"output_token_ids": 3,
|
||||
"metrics": {},
|
||||
},
|
||||
{
|
||||
"outputs": {
|
||||
@@ -141,6 +142,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
|
||||
"reasoning_token_num": 20,
|
||||
},
|
||||
"output_token_ids": 3,
|
||||
"metrics": {},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -142,13 +142,10 @@ class TestTokenProcessorProcessBatchOutput(unittest.TestCase):
|
||||
processor.num_accepted_tokens = 0
|
||||
processor.num_emitted_tokens = 0
|
||||
processor.max_num_emitted_tokens = 0
|
||||
processor.num_rest_requests_per_head = [
|
||||
0,
|
||||
] * MAX_DRAFT_TOKENS
|
||||
processor.num_accept_requests_per_head = [
|
||||
0,
|
||||
] * MAX_DRAFT_TOKENS
|
||||
processor.speculative_stats_step = 0
|
||||
processor.total_step_per_request = {}
|
||||
processor.accept_token_num_per_head_per_request = {}
|
||||
processor.accept_token_num_per_head = [0] * MAX_DRAFT_TOKENS
|
||||
|
||||
# processor._recycle_resources = Mock()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user