Launch expert_service before kv_cache initialization in worker_process (#3045)

* launch expert_service before kv_cache initialization

* add two signals to make sure model loading and expert_service launching are finished (see the sketch after the commit message)

* fix the EP bug

* fix ep

* update the launching approach

* fix ep

* update

* roll back ep

* pre-commit all files

---------

Co-authored-by: RAM <gstian5555@outlook.com>
Co-authored-by: Divano <dddivano@outlook.com>
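Note: the "two signals" bullet above only names the mechanism. Below is a minimal sketch of the idea, not FastDeploy's actual implementation; the event names, process layout, and placeholder steps are hypothetical. expert_service is launched first, and kv_cache initialization is gated on two readiness events, one for model loading and one for the expert_service launch.

import multiprocessing as mp


def expert_service_entry(expert_ready_event):
    # ... launch the expert service here (hypothetical placeholder) ...
    expert_ready_event.set()  # signal 1: expert_service launch finished


def worker_process(model_loaded_event, expert_ready_event):
    # Launch expert_service first, before kv_cache initialization.
    mp.Process(target=expert_service_entry, args=(expert_ready_event,), daemon=True).start()

    # ... load the model here (hypothetical placeholder) ...
    model_loaded_event.set()  # signal 2: model loading finished

    # Initialize kv_cache only after both readiness signals are set.
    model_loaded_event.wait()
    expert_ready_event.wait()
    # ... initialize kv_cache here (hypothetical placeholder) ...


if __name__ == "__main__":
    worker_process(mp.Event(), mp.Event())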
Author: Zero Rains
Date: 2025-08-11 19:38:46 +08:00
Committed by: GitHub
Parent: c27a3dc43b
Commit: b23af29d0b
6 changed files with 175 additions and 100 deletions


@@ -7,23 +7,16 @@
 Boundary value checking for API parameters
 """
 import json
 
-from core import (
-    TEMPLATE,
-    URL,
-    build_request_payload,
-    send_request,
-)
+from core import TEMPLATE, URL, build_request_payload, send_request
 
 
 def test_max_min_1_token():
     data = {
         "stream": False,
         "messages": [{"role": "user", "content": "非洲的首都是?"}],
         "max_tokens": 1,
-        "metadata": {
-            "min_tokens": 1
-        },
+        "metadata": {"min_tokens": 1},
     }
     payload = build_request_payload(TEMPLATE, data)
     response = send_request(URL, payload).json()
@@ -33,4 +26,4 @@ def test_max_min_1_token():
     completion_tokens = response["usage"]["completion_tokens"]
     assert completion_tokens == 1, f"实际生成的token数为: {completion_tokens}, 应该为1"
     finish_reason = response["choices"][0]["finish_reason"]
-    assert finish_reason == "length", f"内容不可能完整生成, 但实际finish_reason为: {response}"
+    assert finish_reason == "length", f"内容不可能完整生成, 但实际finish_reason为: {response}"