From fad28a764cae809761809f8d394b580120b36f4e Mon Sep 17 00:00:00 2001
From: Nicolas Mowen
Date: Thu, 11 Sep 2025 10:20:25 -0600
Subject: [PATCH] Use CUDA graphs for object detection on Nvidia GPUs (#20027)

* Use CUDA graphs to improve efficiency of object detection

* Cleanup comments and typing
---
 frigate/detectors/plugins/onnx.py | 92 +++++++++++++++++++++++++++----
 1 file changed, 81 insertions(+), 11 deletions(-)

diff --git a/frigate/detectors/plugins/onnx.py b/frigate/detectors/plugins/onnx.py
index 45e37d6cd..527de7e11 100644
--- a/frigate/detectors/plugins/onnx.py
+++ b/frigate/detectors/plugins/onnx.py
@@ -1,6 +1,7 @@
 import logging
 
 import numpy as np
+import onnxruntime as ort
 from pydantic import Field
 from typing_extensions import Literal
 
@@ -22,6 +23,53 @@ logger = logging.getLogger(__name__)
 DETECTOR_KEY = "onnx"
 
 
+class CudaGraphRunner:
+    """Encapsulates CUDA Graph capture and replay using ONNX Runtime IOBinding.
+
+    This runner assumes a single tensor input and binds all model outputs.
+    """
+
+    def __init__(self, session: ort.InferenceSession, cuda_device_id: int):
+        self._session = session
+        self._cuda_device_id = cuda_device_id
+        self._captured = False
+        self._io_binding: ort.IOBinding | None = None
+        self._input_name: str | None = None
+        self._output_names: list[str] | None = None
+        self._input_ortvalue: ort.OrtValue | None = None
+        self._output_ortvalues: ort.OrtValue | None = None
+
+    def run(self, input_name: str, tensor_input: np.ndarray):
+        tensor_input = np.ascontiguousarray(tensor_input)
+
+        if not self._captured:
+            # Prepare IOBinding with CUDA buffers and let ORT allocate outputs on device
+            self._io_binding = self._session.io_binding()
+            self._input_name = input_name
+            self._output_names = [o.name for o in self._session.get_outputs()]
+
+            self._input_ortvalue = ort.OrtValue.ortvalue_from_numpy(
+                tensor_input, "cuda", self._cuda_device_id
+            )
+            self._io_binding.bind_ortvalue_input(self._input_name, self._input_ortvalue)
+
+            for name in self._output_names:
+                # Bind outputs to CUDA and allow ORT to allocate appropriately
+                self._io_binding.bind_output(name, "cuda", self._cuda_device_id)
+
+            # First IOBinding run to allocate, execute, and capture CUDA Graph
+            ro = ort.RunOptions()
+            self._session.run_with_iobinding(self._io_binding, ro)
+            self._captured = True
+            return self._io_binding.copy_outputs_to_cpu()
+
+        # Replay using updated input, copy results to CPU
+        self._input_ortvalue.update_inplace(tensor_input)
+        ro = ort.RunOptions()
+        self._session.run_with_iobinding(self._io_binding, ro)
+        return self._io_binding.copy_outputs_to_cpu()
+
+
 class ONNXDetectorConfig(BaseDetectorConfig):
     type: Literal[DETECTOR_KEY]
     device: str = Field(default="AUTO", title="Device Type")
@@ -33,16 +81,6 @@ class ONNXDetector(DetectionApi):
     def __init__(self, detector_config: ONNXDetectorConfig):
         super().__init__(detector_config)
 
-        try:
-            import onnxruntime as ort
-
-            logger.info("ONNX: loaded onnxruntime module")
-        except ModuleNotFoundError:
-            logger.error(
-                "ONNX: module loading failed, need 'pip install onnxruntime'?!?"
-            )
-            raise
-
         path = detector_config.model.path
         logger.info(f"ONNX: loading {detector_config.model.path}")
 
@@ -50,6 +88,15 @@ class ONNXDetector(DetectionApi):
             detector_config.device == "CPU", detector_config.device
         )
 
+        # Enable CUDA Graphs when running with the CUDA execution provider
+        if "CUDAExecutionProvider" in providers:
+            cuda_idx = providers.index("CUDAExecutionProvider")
+            # mutate only this call's provider options
+            options[cuda_idx] = {
+                **options[cuda_idx],
+                "enable_cuda_graph": True,
+            }
+
         self.model = ort.InferenceSession(
             path, providers=providers, provider_options=options
         )
@@ -62,6 +109,19 @@ class ONNXDetector(DetectionApi):
         if self.onnx_model_type == ModelTypeEnum.yolox:
             self.calculate_grids_strides()
 
+        self._cuda_device_id = 0
+        self._cg_runner: CudaGraphRunner | None = None
+
+        try:
+            if "CUDAExecutionProvider" in providers:
+                cuda_idx = providers.index("CUDAExecutionProvider")
+                self._cuda_device_id = options[cuda_idx].get("device_id", 0)
+
+                if options[cuda_idx].get("enable_cuda_graph"):
+                    self._cg_runner = CudaGraphRunner(self.model, self._cuda_device_id)
+        except Exception:
+            pass
+
         logger.info(f"ONNX: {path} loaded")
 
     def detect_raw(self, tensor_input: np.ndarray):
@@ -78,7 +138,17 @@ class ONNXDetector(DetectionApi):
             return post_process_dfine(tensor_output, self.width, self.height)
 
         model_input_name = self.model.get_inputs()[0].name
-        tensor_output = self.model.run(None, {model_input_name: tensor_input})
+
+        if self._cg_runner is not None:
+            try:
+                # Run using CUDA graphs if available
+                tensor_output = self._cg_runner.run(model_input_name, tensor_input)
+            except Exception:
+                logger.warning("CUDA Graphs failed, falling back to regular run")
+                self._cg_runner = None
+        if self._cg_runner is None:
+            # Use regular run if CUDA graphs are not available (or replay just failed)
+            tensor_output = self.model.run(None, {model_input_name: tensor_input})
 
         if self.onnx_model_type == ModelTypeEnum.rfdetr:
             return post_process_rfdetr(tensor_output)