diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 6ac5c12a0..c9001e6f6 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -862,7 +862,7 @@ class GraphOptimizationConfig: self.real_shape_to_captured_size[bs] = end self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size - def _set_cudagraph_sizes(self, max_num_seqs: int = 0): + def _set_cudagraph_sizes(self, max_capture_size: int = 0): """ Calculate a series of candidate capture sizes, and then extract a portion of them as the capture list for the CUDA graph based on user input. @@ -874,7 +874,7 @@ class GraphOptimizationConfig: # Shape [256, 288, ... 992, 1024] draft_capture_sizes += [32 * i for i in range(9, 33)] - draft_capture_sizes.append(max_num_seqs) + draft_capture_sizes.append(max_capture_size) self.cudagraph_capture_sizes = sorted(draft_capture_sizes) def to_json_string(self): @@ -1391,19 +1391,22 @@ class FDConfig: self.cache_config: CacheConfig = cache_config # type: ignore self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config - # Initialize cuda graph capture list - if self.graph_opt_config.cudagraph_capture_sizes is None: - self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs) + # Initialize cuda graph capture list + max_capture_shape = self.scheduler_config.max_num_seqs + if self.speculative_config is not None and self.speculative_config.method == "mtp": + max_capture_shape = self.scheduler_config.max_num_seqs * ( + self.speculative_config.num_speculative_tokens + 1 + ) + assert max_capture_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios." 
if self.graph_opt_config.cudagraph_only_prefill: - self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512) - elif self.speculative_config is not None and self.speculative_config.method == "mtp": - max_shape = self.scheduler_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1) - if max_shape % 2 == 1: - max_shape = max_shape + 1 - self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=min(512, max_shape)) + max_capture_shape = 512 else: - self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs) + max_capture_shape = min(512, max_capture_shape) + + if self.graph_opt_config.cudagraph_capture_sizes is None: + self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape) + self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape) self.tokenizer = tokenizer self.ips = ips diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 863ab0a44..989349827 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -171,7 +171,7 @@ class CudaGraphPiecewiseBackend: for n in range(entry.num_finished_warmup, self.warm_up_size): entry.num_finished_warmup += 1 entry.runnable(**kwargs) - logger.debug( + logger.info( f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, " f"finished ({n + 1}/{entry.num_finished_warmup}) times" ) @@ -207,7 +207,7 @@ class CudaGraphPiecewiseBackend: # For CUDAGraph debug # self._save_cudagrpah_dot_files(entry) - logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}") + logger.info(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}") # Replay entry.cuda_graph.replay() @@ -224,7 +224,7 @@ class 
CudaGraphPiecewiseBackend: for shape in self.cudagraph_capture_sizes: self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape) - logger.debug( + logger.info( f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry." ) @@ -254,3 +254,9 @@ class CudaGraphPiecewiseBackend: f"{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}", 1 << 0, ) + + def check_capture_successful(self): """Check whether the shapes are captured or not""" for shape, entry in self.concrete_size_entries.items(): if not entry.captured: raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.") diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py index a5b87a057..98057c9e2 100644 --- a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py +++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py @@ -34,6 +34,10 @@ from fastdeploy.model_executor.graph_optimization.utils import in_profile_run_mo from fastdeploy.model_executor.graph_optimization.utils import ( in_sot_warmup_mode as in_warmup_mode, ) +from fastdeploy.utils import get_logger + +logger = get_logger("cudagraph_piecewise_backend", "cudagraph_piecewise_backend.log") + P = ParamSpec("P") T = TypeVar("T") @@ -105,6 +109,9 @@ class GraphOptBackend: self.dy_runnable = self.runnable self.fd_config = fd_config self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0] + self._debug_count_cudagraph_replay = 0 + self._debug_count_total_step = 0 + if self.fd_config.graph_opt_config.graph_opt_level > 0: # 1.
Prepare cuda graph input buffers (contain output of subgraphs) @@ -123,6 +130,7 @@ class GraphOptBackend: ) def __call__(self, **kwargs): + self._debug_count_total_step += 1 if not self.fd_config.graph_opt_config.use_cudagraph: return self.runnable(**kwargs) if self.cudagraph_piecewise_backend is None: @@ -136,6 +144,10 @@ class GraphOptBackend: if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.cudagraph_switch_threshold): return self.dy_runnable(**kwargs) else: + self._debug_count_cudagraph_replay += 1 + logger.debug( + f"[CUDA GRAPH][ID:{id(self.cudagraph_piecewise_backend)}] Total step count: {self._debug_count_total_step}, CUDAGraph replay count: {self._debug_count_cudagraph_replay}" + ) return self.cudagraph_piecewise_backend.__call__(**kwargs) def clear_cudagraph_piecewise_backend(self):