[NewFeature]Support dp multi api server && Fix some bug in mixed ep && merge develop (#3598)

* [Feature] update ep

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix queue ports idx

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* Update engine.py

* fix ci

* fix some bug in mixed ep

* add server fix and op fix

* rm some log

* fix code style

* ltd fix

* fix

* fix

* fix some bug

* fix bug

* fix bug

* fix style

* Update config.py

* Update splitwise_connector.py

* Update cache_messager.py

* Update __init__.py

* merge and fix

* Update engine.py

* Update common_engine.py

* Update run_ci_xpu.sh

* Update ernie_processor.py

* Update ernie_processor.py

---------

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
This commit is contained in:
gaoziyuan
2025-08-26 19:59:02 +08:00
committed by GitHub
parent cbce94a00e
commit 82e64b13e1
24 changed files with 1244 additions and 1200 deletions

View File

@@ -49,6 +49,7 @@ def get_moe_scores(
compute moe scores using e_score_correction_bias.
"""
scores = paddle.nn.functional.sigmoid(gating_output)
assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
scores_with_bias = scores + e_score_correction_bias
scores, topk_values, topk_idx = noaux_tc(
scores,
@@ -104,11 +105,12 @@ class DeepEPEngine:
# In mixed EP mode on a single node, we dynamically switch between
# high throughput and low latency modes.
if splitwise_role == "mixed":
self.deepep_engine = deep_ep.Buffer(
self.group,
int(2e9),
int(5e9),
int(6e9),
low_latency_mode=True,
num_qps_per_rank=24,
)
@@ -387,6 +389,7 @@ class EPPrefillRunner(EPRunner):
*args,
**kwargs,
):
(
num_tokens_per_rank,
num_tokens_per_rdma_rank,