[Perf] Support zero-copy tensor transmission between worker and engine to improve efficiency (#4839)

* feat(zmq): support tensor transmission with zero-copy for improved efficiency

* perf: disable copying in zmq.send

* zmq recv data for debug

* allocate prompt logprobs tensors on the CPU
This commit is contained in:
SunLei
2025-11-11 15:43:11 +08:00
committed by GitHub
parent 8b61f01c68
commit 3098aee05f
8 changed files with 23 additions and 18 deletions

View File

@@ -2735,7 +2735,7 @@ class GPUModelRunner(ModelRunnerBase):
logprobs_tensors = self.in_progress_prompt_logprobs.get(req_id)
if not logprobs_tensors:
logprobs_tensors = LogprobsTensors.empty(num_prompt_tokens - 1, num_prompt_logprobs + 1)
logprobs_tensors = LogprobsTensors.empty_cpu(num_prompt_tokens - 1, num_prompt_logprobs + 1)
self.in_progress_prompt_logprobs[req_id] = logprobs_tensors
start_idx = request.prefill_start_index
start_tok = start_idx + 1