rename fused_get_rope.cu (#3752)

* rename fused_get_rope.cu

* fix

* fix typos

* fix

* fix
This commit is contained in:
co63oc
2025-09-03 10:54:34 +08:00
committed by GitHub
parent 2c9b169c0e
commit 5441538173
17 changed files with 31 additions and 31 deletions

View File

@@ -1023,7 +1023,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function"); m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");
/** /**
* moe/fused_moe/moe_ffn_wint2.cu * moe/fused_moe/moe_expert_ffn_wint2.cu
* moe_expert_ffn_wint2 * moe_expert_ffn_wint2
*/ */
m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function"); m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");

View File

@@ -204,7 +204,7 @@ if paddle.is_compiled_with_rocm():
"gpu_ops/get_output_msg_with_topk.cc", "gpu_ops/get_output_msg_with_topk.cc",
"gpu_ops/save_output_msg_with_topk.cc", "gpu_ops/save_output_msg_with_topk.cc",
"gpu_ops/transfer_output.cc", "gpu_ops/transfer_output.cc",
"gpu_ops/set_value_by_flags.cu", "gpu_ops/set_value_by_flags_and_idx.cu",
"gpu_ops/token_penalty_multi_scores.cu", "gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/stop_generation.cu", "gpu_ops/stop_generation.cu",
"gpu_ops/stop_generation_multi_ends.cu", "gpu_ops/stop_generation_multi_ends.cu",
@@ -223,7 +223,7 @@ if paddle.is_compiled_with_rocm():
"gpu_ops/speculate_decoding/speculate_get_output_padding_offset.cu", "gpu_ops/speculate_decoding/speculate_get_output_padding_offset.cu",
"gpu_ops/speculate_decoding/speculate_get_seq_lens_output.cu", "gpu_ops/speculate_decoding/speculate_get_seq_lens_output.cu",
"gpu_ops/speculate_decoding/speculate_save_output.cc", "gpu_ops/speculate_decoding/speculate_save_output.cc",
"gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu", "gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu",
"gpu_ops/speculate_decoding/speculate_step.cu", "gpu_ops/speculate_decoding/speculate_step.cu",
"gpu_ops/speculate_decoding/speculate_step_system_cache.cu", "gpu_ops/speculate_decoding/speculate_step_system_cache.cu",
"gpu_ops/speculate_decoding/speculate_update_v3.cu", "gpu_ops/speculate_decoding/speculate_update_v3.cu",
@@ -261,7 +261,7 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/save_output_msg_with_topk.cc", "gpu_ops/save_output_msg_with_topk.cc",
"gpu_ops/transfer_output.cc", "gpu_ops/transfer_output.cc",
"gpu_ops/set_mask_value.cu", "gpu_ops/set_mask_value.cu",
"gpu_ops/set_value_by_flags.cu", "gpu_ops/set_value_by_flags_and_idx.cu",
"gpu_ops/ngram_mask.cu", "gpu_ops/ngram_mask.cu",
"gpu_ops/gather_idx.cu", "gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc", "gpu_ops/get_output_ep.cc",
@@ -276,7 +276,7 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/recover_decode_task.cu", "gpu_ops/recover_decode_task.cu",
"gpu_ops/step.cu", "gpu_ops/step.cu",
"gpu_ops/step_reschedule.cu", "gpu_ops/step_reschedule.cu",
"gpu_ops/fused_get_rope.cu", "gpu_ops/fused_get_rotary_embedding.cu",
"gpu_ops/get_padding_offset.cu", "gpu_ops/get_padding_offset.cu",
"gpu_ops/update_inputs.cu", "gpu_ops/update_inputs.cu",
"gpu_ops/update_inputs_beam.cu", "gpu_ops/update_inputs_beam.cu",
@@ -560,7 +560,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"gpu_ops/save_output_msg_with_topk.cc", "gpu_ops/save_output_msg_with_topk.cc",
"gpu_ops/transfer_output.cc", "gpu_ops/transfer_output.cc",
"gpu_ops/get_padding_offset.cu", "gpu_ops/get_padding_offset.cu",
"gpu_ops/set_value_by_flags.cu", "gpu_ops/set_value_by_flags_and_idx.cu",
"gpu_ops/rebuild_padding.cu", "gpu_ops/rebuild_padding.cu",
"gpu_ops/update_inputs.cu", "gpu_ops/update_inputs.cu",
"gpu_ops/stop_generation_multi_ends.cu", "gpu_ops/stop_generation_multi_ends.cu",
@@ -609,7 +609,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
"gpu_ops/transfer_output.cc", "gpu_ops/transfer_output.cc",
"gpu_ops/save_with_output.cc", "gpu_ops/save_with_output.cc",
"gpu_ops/set_mask_value.cu", "gpu_ops/set_mask_value.cu",
"gpu_ops/set_value_by_flags.cu", "gpu_ops/set_value_by_flags_and_idx.cu",
"gpu_ops/ngram_mask.cu", "gpu_ops/ngram_mask.cu",
"gpu_ops/gather_idx.cu", "gpu_ops/gather_idx.cu",
"gpu_ops/get_output_ep.cc", "gpu_ops/get_output_ep.cc",
@@ -618,7 +618,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
"gpu_ops/stop_generation.cu", "gpu_ops/stop_generation.cu",
"gpu_ops/stop_generation_multi_ends.cu", "gpu_ops/stop_generation_multi_ends.cu",
"gpu_ops/set_flags.cu", "gpu_ops/set_flags.cu",
"gpu_ops/fused_get_rope.cu", "gpu_ops/fused_get_rotary_embedding.cu",
"gpu_ops/get_padding_offset.cu", "gpu_ops/get_padding_offset.cu",
"gpu_ops/update_inputs.cu", "gpu_ops/update_inputs.cu",
"gpu_ops/update_inputs_beam.cu", "gpu_ops/update_inputs_beam.cu",

View File

@@ -733,7 +733,7 @@ class DeepseekV3ForCausalLM(ModelForCasualLM):
return hidden_states return hidden_states
def clear_grpah_opt_backend(self): def clear_grpah_opt_backend(self):
"""Clear graph optimization bakcend, the captured cuda graph will be cleaned""" """Clear graph optimization backend, the captured cuda graph will be cleaned"""
self.model.clear_grpah_opt_backend(fd_config=self.fd_config) self.model.clear_grpah_opt_backend(fd_config=self.fd_config)

View File

@@ -620,7 +620,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
return hidden_states return hidden_states
def clear_grpah_opt_backend(self): def clear_grpah_opt_backend(self):
"""Clear graph optimization bakcend, the captured cuda graph will be cleaned""" """Clear graph optimization backend, the captured cuda graph will be cleaned"""
self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config) self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)

View File

@@ -721,7 +721,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
return hidden_states return hidden_states
def clear_grpah_opt_backend(self): def clear_grpah_opt_backend(self):
"""Clear graph optimization bakcend, the captured cuda graph will be cleaned""" """Clear graph optimization backend, the captured cuda graph will be cleaned"""
self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config) self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)

View File

@@ -395,7 +395,7 @@ class Qwen2ForCausalLM(ModelForCasualLM):
return hidden_states return hidden_states
def clear_grpah_opt_backend(self): def clear_grpah_opt_backend(self):
"""Clear graph optimization bakcend, the captured cuda graph will be cleaned""" """Clear graph optimization backend, the captured cuda graph will be cleaned"""
self.qwen2.clear_grpah_opt_backend(fd_config=self.fd_config) self.qwen2.clear_grpah_opt_backend(fd_config=self.fd_config)

View File

@@ -331,7 +331,7 @@ class Qwen3ForCausalLM(ModelForCasualLM):
return hidden_states return hidden_states
def clear_grpah_opt_backend(self): def clear_grpah_opt_backend(self):
"""Clear graph optimization bakcend, the captured cuda graph will be cleaned""" """Clear graph optimization backend, the captured cuda graph will be cleaned"""
self.model.clear_grpah_opt_backend(fd_config=self.fd_config) self.model.clear_grpah_opt_backend(fd_config=self.fd_config)

View File

@@ -452,7 +452,7 @@ class Qwen3MoeForCausalLM(ModelForCasualLM):
return hidden_states return hidden_states
def clear_grpah_opt_backend(self): def clear_grpah_opt_backend(self):
"""Clear graph optimization bakcend, the captured cuda graph will be cleaned""" """Clear graph optimization backend, the captured cuda graph will be cleaned"""
self.model.clear_grpah_opt_backend(fd_config=self.fd_config) self.model.clear_grpah_opt_backend(fd_config=self.fd_config)

View File

@@ -170,14 +170,14 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
input_tensor1 = paddle.ones([8]) input_tensor1 = paddle.ones([8])
forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True) forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)
# Triger Capture # Trigger Capture
_ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
# Reaplay # Replay
_ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
# Corrent output # Correct output
output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
assert (output1 == output1_correct).all() assert (output1 == output1_correct).all()

View File

@@ -102,43 +102,43 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
input_tensor1 = paddle.ones([1, 32768]) input_tensor1 = paddle.ones([1, 32768])
forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True) forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)
# Corrent output # Correct output
self.output_correct = self.test_model1.forward_correct( self.output_correct = self.test_model1.forward_correct(
ids_remove_padding=input_tensor1, forward_meta=forward_meta1 ids_remove_padding=input_tensor1, forward_meta=forward_meta1
) )
# Capture and Destory # Capture and Destroy
self.capture_and_replay(input_tensor1, forward_meta1) self.capture_and_replay(input_tensor1, forward_meta1)
self.recapture_and_replay(input_tensor1, forward_meta1) self.recapture_and_replay(input_tensor1, forward_meta1)
def capture_and_replay(self, input_tensor1, forward_meta1): def capture_and_replay(self, input_tensor1, forward_meta1):
""" """ """ """
# Triger Capture # Trigger Capture
print_gpu_memory_use(0, "before capture") print_gpu_memory_use(0, "before capture")
output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
print_gpu_memory_use(0, "after capture") print_gpu_memory_use(0, "after capture")
# Reaplay # Replay
output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
assert (output1 == self.output_correct).all() assert (output1 == self.output_correct).all()
# Destory # Destroy
print_gpu_memory_use(0, "before destory") print_gpu_memory_use(0, "before destory")
self.test_model1.clear_grpah_opt_backend() self.test_model1.clear_grpah_opt_backend()
print_gpu_memory_use(0, "after destory") print_gpu_memory_use(0, "after destory")
def recapture_and_replay(self, input_tensor1, forward_meta1): def recapture_and_replay(self, input_tensor1, forward_meta1):
""" """ """ """
# Triger Capture # Trigger Capture
print_gpu_memory_use(0, "before recapture") print_gpu_memory_use(0, "before recapture")
output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
print_gpu_memory_use(0, "after recapture") print_gpu_memory_use(0, "after recapture")
# Reaplay # Replay
output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
assert (output2 == self.output_correct).all() assert (output2 == self.output_correct).all()
# Destory # Destroy
print_gpu_memory_use(0, "before destory") print_gpu_memory_use(0, "before destory")
self.test_model1.clear_grpah_opt_backend() self.test_model1.clear_grpah_opt_backend()
print_gpu_memory_use(0, "after destory") print_gpu_memory_use(0, "after destory")

View File

@@ -117,14 +117,14 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
input_tensor1 = paddle.ones([1, 32768]) input_tensor1 = paddle.ones([1, 32768])
forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True) forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)
# Triger Capture # Trigger Capture
_ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
# Reaplay # Replay
_ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
# Corrent output # Correct output
output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1) output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
assert (output1 == output1_correct).all() assert (output1 == output1_correct).all()

View File

@@ -104,14 +104,14 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
x = paddle.randint(32, shape=[1, 8]) x = paddle.randint(32, shape=[1, 8])
forward_meta1 = ForwardMeta(input_ids=x, ids_remove_padding=x, step_use_cudagraph=True) forward_meta1 = ForwardMeta(input_ids=x, ids_remove_padding=x, step_use_cudagraph=True)
# Triger Capture # Trigger Capture
_ = test_model1(x, forward_meta=forward_meta1) _ = test_model1(x, forward_meta=forward_meta1)
# Reaplay # Replay
_ = test_model1(x, forward_meta=forward_meta1) _ = test_model1(x, forward_meta=forward_meta1)
output1 = test_model1(x, forward_meta=forward_meta1) output1 = test_model1(x, forward_meta=forward_meta1)
# Corrent output # Correct output
output1_correct = test_model1.forward_correct(x, forward_meta=forward_meta1) output1_correct = test_model1.forward_correct(x, forward_meta=forward_meta1)
assert (output1 == output1_correct).all() assert (output1 == output1_correct).all()