Mirror of https://github.com/PaddlePaddle/FastDeploy.git
rename fused_get_rope.cu (#3752)

* rename fused_get_rope.cu
* fix
* fix typos
* fix
* fix
@@ -1023,7 +1023,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");

   /**
-   * moe/fused_moe/moe_ffn_wint2.cu
+   * moe/fused_moe/moe_expert_ffn_wint2.cu
    * moe_expert_ffn_wint2
    */
   m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");
@@ -204,7 +204,7 @@ if paddle.is_compiled_with_rocm():
         "gpu_ops/get_output_msg_with_topk.cc",
         "gpu_ops/save_output_msg_with_topk.cc",
         "gpu_ops/transfer_output.cc",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/stop_generation.cu",
         "gpu_ops/stop_generation_multi_ends.cu",
@@ -223,7 +223,7 @@ if paddle.is_compiled_with_rocm():
         "gpu_ops/speculate_decoding/speculate_get_output_padding_offset.cu",
         "gpu_ops/speculate_decoding/speculate_get_seq_lens_output.cu",
         "gpu_ops/speculate_decoding/speculate_save_output.cc",
-        "gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu",
+        "gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu",
         "gpu_ops/speculate_decoding/speculate_step.cu",
         "gpu_ops/speculate_decoding/speculate_step_system_cache.cu",
         "gpu_ops/speculate_decoding/speculate_update_v3.cu",
@@ -261,7 +261,7 @@ elif paddle.is_compiled_with_cuda():
         "gpu_ops/save_output_msg_with_topk.cc",
         "gpu_ops/transfer_output.cc",
         "gpu_ops/set_mask_value.cu",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/ngram_mask.cu",
         "gpu_ops/gather_idx.cu",
         "gpu_ops/get_output_ep.cc",
@@ -276,7 +276,7 @@ elif paddle.is_compiled_with_cuda():
         "gpu_ops/recover_decode_task.cu",
         "gpu_ops/step.cu",
         "gpu_ops/step_reschedule.cu",
-        "gpu_ops/fused_get_rope.cu",
+        "gpu_ops/fused_get_rotary_embedding.cu",
         "gpu_ops/get_padding_offset.cu",
         "gpu_ops/update_inputs.cu",
         "gpu_ops/update_inputs_beam.cu",
@@ -560,7 +560,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
         "gpu_ops/save_output_msg_with_topk.cc",
         "gpu_ops/transfer_output.cc",
         "gpu_ops/get_padding_offset.cu",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/rebuild_padding.cu",
         "gpu_ops/update_inputs.cu",
         "gpu_ops/stop_generation_multi_ends.cu",
@@ -609,7 +609,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/transfer_output.cc",
         "gpu_ops/save_with_output.cc",
         "gpu_ops/set_mask_value.cu",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/ngram_mask.cu",
         "gpu_ops/gather_idx.cu",
         "gpu_ops/get_output_ep.cc",
@@ -618,7 +618,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/stop_generation.cu",
         "gpu_ops/stop_generation_multi_ends.cu",
         "gpu_ops/set_flags.cu",
-        "gpu_ops/fused_get_rope.cu",
+        "gpu_ops/fused_get_rotary_embedding.cu",
         "gpu_ops/get_padding_offset.cu",
         "gpu_ops/update_inputs.cu",
         "gpu_ops/update_inputs_beam.cu",
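The same two renames (set_value_by_flags.cu to set_value_by_flags_and_idx.cu, fused_get_rope.cu to fused_get_rotary_embedding.cu) have to be applied to every per-backend source list above, so a stale entry is easy to miss. Below is a minimal sketch of a repo-wide check, assuming it runs from the repository root; the OLD_NAMES list and extension set are illustrative, not part of the commit.

# Minimal stale-reference scan; run from the repository root (assumption).
import pathlib

OLD_NAMES = ("fused_get_rope.cu", "set_value_by_flags.cu")

for path in pathlib.Path(".").rglob("*"):
    if path.is_file() and path.suffix in {".py", ".cc", ".cu", ".h"}:
        text = path.read_text(errors="ignore")
        for name in OLD_NAMES:
            if name in text:
                # Any hit means a source list or include still uses the old name.
                print(f"stale reference to {name}: {path}")

Note that "set_value_by_flags.cu" is also a suffix of the speculate-decoding variant, so this scan catches stale "speculate_set_value_by_flags.cu" entries as well.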
@@ -733,7 +733,7 @@ class DeepseekV3ForCausalLM(ModelForCasualLM):
         return hidden_states

     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)


@@ -620,7 +620,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
         return hidden_states

     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)


@@ -721,7 +721,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
         return hidden_states

     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)


@@ -395,7 +395,7 @@ class Qwen2ForCausalLM(ModelForCasualLM):
         return hidden_states

     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.qwen2.clear_grpah_opt_backend(fd_config=self.fd_config)


@@ -331,7 +331,7 @@ class Qwen3ForCausalLM(ModelForCasualLM):
         return hidden_states

     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)


@@ -452,7 +452,7 @@ class Qwen3MoeForCausalLM(ModelForCasualLM):
         return hidden_states

     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)


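Every wrapper above follows the same delegation pattern: the *ForCausalLM class forwards the clear call to its inner network (self.model, self.ernie, or self.qwen2), which releases its captured CUDA graphs. A minimal sketch of that shape with illustrative names follows; note the project spells the method clear_grpah_opt_backend, typo included, so callers must match that spelling.

# Illustrative sketch of the delegation pattern, not FastDeploy's code.
class ToyNetwork:
    def clear_grpah_opt_backend(self, fd_config):
        # The real network would free its captured CUDA graphs here.
        print(f"cleared captured graphs for {fd_config!r}")


class ToyForCausalLM:
    def __init__(self, fd_config):
        self.fd_config = fd_config
        self.model = ToyNetwork()

    def clear_grpah_opt_backend(self):
        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
        self.model.clear_grpah_opt_backend(fd_config=self.fd_config)


ToyForCausalLM(fd_config="toy-config").clear_grpah_opt_backend()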
@@ -170,14 +170,14 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
         input_tensor1 = paddle.ones([8])
         forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)

-        # Triger Capture
+        # Trigger Capture
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)

-        # Reaplay
+        # Replay
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)

-        # Corrent output
+        # Correct output
         output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)

         assert (output1 == output1_correct).all()
@@ -102,43 +102,43 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
         input_tensor1 = paddle.ones([1, 32768])
         forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)

-        # Corrent output
+        # Correct output
         self.output_correct = self.test_model1.forward_correct(
             ids_remove_padding=input_tensor1, forward_meta=forward_meta1
         )

-        # Capture and Destory
+        # Capture and Destroy
         self.capture_and_replay(input_tensor1, forward_meta1)
         self.recapture_and_replay(input_tensor1, forward_meta1)

     def capture_and_replay(self, input_tensor1, forward_meta1):
         """ """
-        # Triger Capture
+        # Trigger Capture
         print_gpu_memory_use(0, "before capture")
         output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         print_gpu_memory_use(0, "after capture")

-        # Reaplay
+        # Replay
         output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         assert (output1 == self.output_correct).all()

-        # Destory
+        # Destroy
         print_gpu_memory_use(0, "before destory")
         self.test_model1.clear_grpah_opt_backend()
         print_gpu_memory_use(0, "after destory")

     def recapture_and_replay(self, input_tensor1, forward_meta1):
         """ """
-        # Triger Capture
+        # Trigger Capture
         print_gpu_memory_use(0, "before recapture")
         output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         print_gpu_memory_use(0, "after recapture")

-        # Reaplay
+        # Replay
         output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         assert (output2 == self.output_correct).all()

-        # Destory
+        # Destroy
         print_gpu_memory_use(0, "before destory")
         self.test_model1.clear_grpah_opt_backend()
         print_gpu_memory_use(0, "after destory")
@@ -117,14 +117,14 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
         input_tensor1 = paddle.ones([1, 32768])
         forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)

-        # Triger Capture
+        # Trigger Capture
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)

-        # Reaplay
+        # Replay
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)

-        # Corrent output
+        # Correct output
         output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)

         assert (output1 == output1_correct).all()
@@ -104,14 +104,14 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
         x = paddle.randint(32, shape=[1, 8])
         forward_meta1 = ForwardMeta(input_ids=x, ids_remove_padding=x, step_use_cudagraph=True)

-        # Triger Capture
+        # Trigger Capture
         _ = test_model1(x, forward_meta=forward_meta1)

-        # Reaplay
+        # Replay
         _ = test_model1(x, forward_meta=forward_meta1)
         output1 = test_model1(x, forward_meta=forward_meta1)

-        # Corrent output
+        # Correct output
         output1_correct = test_model1.forward_correct(x, forward_meta=forward_meta1)

         assert (output1 == output1_correct).all()
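All four tests exercise the same lifecycle: the first call through the graph-optimization backend captures a CUDA graph, subsequent calls replay it, and clear_grpah_opt_backend destroys it. A minimal sketch of that lifecycle against Paddle's raw CUDA-graph API, assuming a CUDA build of PaddlePaddle; the tests themselves go through FastDeploy's backend rather than this API.

# Capture/replay/destroy lifecycle sketch using Paddle's CUDA-graph API.
# Requires a CUDA build of PaddlePaddle; illustrative, not FastDeploy code.
import paddle
from paddle.device.cuda.graphs import CUDAGraph

x = paddle.ones([8])

graph = CUDAGraph()
graph.capture_begin()   # "Trigger Capture": record the kernel sequence
y = x * 2.0 + 1.0
graph.capture_end()

graph.replay()          # "Replay": rerun the recorded kernels, writing y in place
assert (y == paddle.full([8], 3.0)).all()

graph.reset()           # "Destroy": free the captured graph's resources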