[Perf] Support zero-copy tensor transmission between worker and engine to improve efficiency (#4839)

* feat(zmq): support tensor transmission with zero-copy for improved efficiency
* perf: disable copying in zmq.send
* zmq recv data for debug
* convert logprobs tensors to CPU
2025-12-24 13:28:13 +08:00 · 2025-11-11 15:43:11 +08:00
parent 8b61f01c68
commit 3098aee05f
8 changed files with 23 additions and 18 deletions
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -334,7 +334,9 @@ class Sampler(nn.Layer):
        else:
            indices = token_ids
            top_logprobs = token_logprobs
-
+        indices = indices.cpu()
+        top_logprobs = top_logprobs.cpu()
+        token_ranks = token_ranks.cpu()
        return LogprobsTensors(indices, top_logprobs, token_ranks)

    def forward_cuda(