# Benchmark run settings: a flat mapping with one setting per line.
# NOTE(review): the meaning of every key is defined by the tool that reads
# this file; the group headings below are inferred from key names - confirm.

# Target device and CPU threading.
device: gpu
device_id: 3
cpu_thread_nums: 1

# Iteration counts.
warmup: 200
repeat: 1000

# Backend selection and profiling switches.
backend: ort
profile_mode: runtime
include_h2d_d2h: false
use_fp16: false

# Optional extra measurements.
collect_memory_info: false
sampling_interval: 1
precision_compare: false

# Only meaningful for XPU targets, presumably - verify against the consumer.
xpu_l3_cache: 0

# Output file; the name mirrors the device, backend and use_fp16 values above.
result_path: benchmark_gpu_ort_fp32.txt