	rename fused_get_rope.cu (#3752)
* rename fused_get_rope.cu
* fix
* fix typos
* fix
* fix
@@ -1023,7 +1023,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");
 
   /**
-   * moe/fused_moe/moe_ffn_wint2.cu
+   * moe/fused_moe/moe_expert_ffn_wint2.cu
    * moe_expert_ffn_wint2
    */
   m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");
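The change above only rewrites the comment recording which .cu source a binding lives in; the Python-visible op name is whatever string is passed to m.def(). A minimal sketch of what that means for callers (the module name comes from PYBIND11_MODULE(fastdeploy_ops, m) above; the import assumes the custom ops have already been built):

import fastdeploy_ops  # compiled extension; importable only after the custom-op build

# The registered name survives the source-file rename, so call sites need no change.
assert hasattr(fastdeploy_ops, "moe_expert_ffn_wint2")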
@@ -204,7 +204,7 @@ if paddle.is_compiled_with_rocm():
         "gpu_ops/get_output_msg_with_topk.cc",
         "gpu_ops/save_output_msg_with_topk.cc",
         "gpu_ops/transfer_output.cc",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/stop_generation.cu",
         "gpu_ops/stop_generation_multi_ends.cu",
@@ -223,7 +223,7 @@ if paddle.is_compiled_with_rocm():
         "gpu_ops/speculate_decoding/speculate_get_output_padding_offset.cu",
         "gpu_ops/speculate_decoding/speculate_get_seq_lens_output.cu",
         "gpu_ops/speculate_decoding/speculate_save_output.cc",
-        "gpu_ops/speculate_decoding/speculate_set_value_by_flags.cu",
+        "gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu",
         "gpu_ops/speculate_decoding/speculate_step.cu",
         "gpu_ops/speculate_decoding/speculate_step_system_cache.cu",
         "gpu_ops/speculate_decoding/speculate_update_v3.cu",
@@ -261,7 +261,7 @@ elif paddle.is_compiled_with_cuda():
         "gpu_ops/save_output_msg_with_topk.cc",
         "gpu_ops/transfer_output.cc",
         "gpu_ops/set_mask_value.cu",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/ngram_mask.cu",
         "gpu_ops/gather_idx.cu",
         "gpu_ops/get_output_ep.cc",
@@ -276,7 +276,7 @@ elif paddle.is_compiled_with_cuda():
         "gpu_ops/recover_decode_task.cu",
         "gpu_ops/step.cu",
         "gpu_ops/step_reschedule.cu",
-        "gpu_ops/fused_get_rope.cu",
+        "gpu_ops/fused_get_rotary_embedding.cu",
         "gpu_ops/get_padding_offset.cu",
         "gpu_ops/update_inputs.cu",
         "gpu_ops/update_inputs_beam.cu",
@@ -560,7 +560,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "gpu_ops/save_output_msg_with_topk.cc",
                 "gpu_ops/transfer_output.cc",
                 "gpu_ops/get_padding_offset.cu",
-                "gpu_ops/set_value_by_flags.cu",
+                "gpu_ops/set_value_by_flags_and_idx.cu",
                 "gpu_ops/rebuild_padding.cu",
                 "gpu_ops/update_inputs.cu",
                 "gpu_ops/stop_generation_multi_ends.cu",
@@ -609,7 +609,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/transfer_output.cc",
         "gpu_ops/save_with_output.cc",
         "gpu_ops/set_mask_value.cu",
-        "gpu_ops/set_value_by_flags.cu",
+        "gpu_ops/set_value_by_flags_and_idx.cu",
         "gpu_ops/ngram_mask.cu",
         "gpu_ops/gather_idx.cu",
         "gpu_ops/get_output_ep.cc",
@@ -618,7 +618,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/stop_generation.cu",
         "gpu_ops/stop_generation_multi_ends.cu",
         "gpu_ops/set_flags.cu",
-        "gpu_ops/fused_get_rope.cu",
+        "gpu_ops/fused_get_rotary_embedding.cu",
         "gpu_ops/get_padding_offset.cu",
         "gpu_ops/update_inputs.cu",
         "gpu_ops/update_inputs_beam.cu",
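Every hunk in this build script is the same single-entry rename inside a per-backend source list. A hedged sketch of how such a list typically feeds Paddle's custom-op build (paddle.utils.cpp_extension is Paddle's public extension API; the extension name and exact arguments here are illustrative, not the real script's):

from paddle.utils.cpp_extension import CUDAExtension, setup

setup(
    name="fastdeploy_ops",
    ext_modules=CUDAExtension(
        sources=[
            # the two files renamed by this commit, among many others
            "gpu_ops/set_value_by_flags_and_idx.cu",
            "gpu_ops/fused_get_rotary_embedding.cu",
        ],
    ),
)

Because sources are enumerated by literal path, each backend branch (ROCm, CUDA, iluvatar_gpu, metax_gpu) that lists a renamed file needs the same one-line edit, which is why the hunk repeats.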
@@ -733,7 +733,7 @@ class DeepseekV3ForCausalLM(ModelForCasualLM):
         return hidden_states
 
     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
@@ -620,7 +620,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
         return hidden_states
 
     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
@@ -721,7 +721,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
         return hidden_states
 
     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
@@ -395,7 +395,7 @@ class Qwen2ForCausalLM(ModelForCasualLM):
         return hidden_states
 
     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.qwen2.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
@@ -331,7 +331,7 @@ class Qwen3ForCausalLM(ModelForCasualLM):
         return hidden_states
 
     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
@@ -452,7 +452,7 @@ class Qwen3MoeForCausalLM(ModelForCasualLM):
         return hidden_states
 
     def clear_grpah_opt_backend(self):
-        """Clear graph optimization bakcend, the captured cuda graph will be cleaned"""
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
         self.model.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
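The six model hunks above apply an identical docstring fix to the same delegation method. A self-contained sketch of the shared pattern (the base class and inner model are stubs here; only the structure mirrors the real code):

class _InnerModel:  # stand-in for self.model / self.ernie / self.qwen2
    def clear_grpah_opt_backend(self, fd_config=None):
        print("captured CUDA graphs released, config:", fd_config)

class Qwen2ForCausalLM:  # the real class derives from ModelForCasualLM
    def __init__(self):
        self.qwen2 = _InnerModel()
        self.fd_config = None

    def clear_grpah_opt_backend(self):
        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
        self.qwen2.clear_grpah_opt_backend(fd_config=self.fd_config)

Note that the method name itself still misspells "graph" as "grpah" in every class; this commit corrects only the docstrings.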
@@ -170,14 +170,14 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
         input_tensor1 = paddle.ones([8])
         forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)
 
-        # Triger Capture
+        # Trigger Capture
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
 
-        # Reaplay
+        # Replay
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
 
-        # Corrent output
+        # Correct output
         output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
 
         assert (output1 == output1_correct).all()
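The capture/replay/verify sequence in this test mirrors Paddle's underlying CUDA graph mechanism, which FastDeploy's graph-optimization backend wraps. A hedged, minimal sketch against Paddle's low-level API (paddle.device.cuda.graphs.CUDAGraph; a GPU is required):

import paddle
from paddle.device.cuda.graphs import CUDAGraph

x = paddle.ones([8])

g = CUDAGraph()
g.capture_begin()   # "Trigger Capture": record the kernel launches once
y = x * 2.0
g.capture_end()

g.replay()          # "Replay": re-launch the recorded kernels, bypassing Python dispatch
g.reset()           # destroy the graph and free its memory pool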
@@ -102,43 +102,43 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
         input_tensor1 = paddle.ones([1, 32768])
         forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)
 
-        # Corrent output
+        # Correct output
         self.output_correct = self.test_model1.forward_correct(
             ids_remove_padding=input_tensor1, forward_meta=forward_meta1
         )
 
-        # Capture and Destory
+        # Capture and Destroy
         self.capture_and_replay(input_tensor1, forward_meta1)
         self.recapture_and_replay(input_tensor1, forward_meta1)
 
     def capture_and_replay(self, input_tensor1, forward_meta1):
         """ """
-        # Triger Capture
+        # Trigger Capture
         print_gpu_memory_use(0, "before capture")
         output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         print_gpu_memory_use(0, "after capture")
 
-        # Reaplay
+        # Replay
         output1 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         assert (output1 == self.output_correct).all()
 
-        # Destory
+        # Destroy
         print_gpu_memory_use(0, "before destory")
         self.test_model1.clear_grpah_opt_backend()
         print_gpu_memory_use(0, "after destory")
 
     def recapture_and_replay(self, input_tensor1, forward_meta1):
         """ """
-        # Triger Capture
+        # Trigger Capture
         print_gpu_memory_use(0, "before recapture")
         output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         print_gpu_memory_use(0, "after recapture")
 
-        # Reaplay
+        # Replay
         output2 = self.test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         assert (output2 == self.output_correct).all()
 
-        # Destory
+        # Destroy
         print_gpu_memory_use(0, "before destory")
         self.test_model1.clear_grpah_opt_backend()
         print_gpu_memory_use(0, "after destory")
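print_gpu_memory_use is a test helper used here to confirm that destroying and recapturing the graph actually releases and reacquires device memory. Its implementation is not shown in the diff; a plausible minimal version over Paddle's public allocator statistics would be:

import paddle

def print_gpu_memory_use(device_id: int, tag: str) -> None:
    # Paddle's allocator stats for the given GPU, reported in bytes
    print(
        f"[{tag}] device {device_id}: "
        f"allocated={paddle.device.cuda.memory_allocated(device_id)}, "
        f"reserved={paddle.device.cuda.memory_reserved(device_id)}"
    )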
@@ -117,14 +117,14 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
         input_tensor1 = paddle.ones([1, 32768])
         forward_meta1 = ForwardMeta(input_ids=input_tensor1, ids_remove_padding=input_tensor1, step_use_cudagraph=True)
 
-        # Triger Capture
+        # Trigger Capture
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
 
-        # Reaplay
+        # Replay
         _ = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
         output1 = test_model1(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
 
-        # Corrent output
+        # Correct output
         output1_correct = test_model1.forward_correct(ids_remove_padding=input_tensor1, forward_meta=forward_meta1)
 
         assert (output1 == output1_correct).all()
@@ -104,14 +104,14 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
         x = paddle.randint(32, shape=[1, 8])
         forward_meta1 = ForwardMeta(input_ids=x, ids_remove_padding=x, step_use_cudagraph=True)
 
-        # Triger Capture
+        # Trigger Capture
         _ = test_model1(x, forward_meta=forward_meta1)
 
-        # Reaplay
+        # Replay
         _ = test_model1(x, forward_meta=forward_meta1)
         output1 = test_model1(x, forward_meta=forward_meta1)
 
-        # Corrent output
+        # Correct output
         output1_correct = test_model1.forward_correct(x, forward_meta=forward_meta1)
 
         assert (output1 == output1_correct).all()