// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "helper.h" #include float bfloat16_to_float(__nv_bfloat16 x) { uint32_t tmp_x = *(reinterpret_cast(&x)); tmp_x = tmp_x << 16; float float_x = *(reinterpret_cast(&tmp_x)); return float_x; } template static void PrintMatrix(const T* mat_d, int num, std::string name, int numOfCols) { std::vector tmp(num); cudaMemcpy(tmp.data(), mat_d, sizeof(T) * num, cudaMemcpyDeviceToHost); std::ofstream outfile; outfile.open(name + ".dtxt", std::ios::out | std::ios::app); std::stringstream ss; for (int i = 0; i < num; ++i) { if (std::is_same::value || std::is_same::value || std::is_same::value) { ss << static_cast(tmp[i]) << " "; } else { ss << std::setprecision(8) << static_cast(tmp[i]) << " "; } if (i % numOfCols == numOfCols - 1) { ss << std::endl; } } outfile << ss.str(); outfile.close(); } GPUMemoryChecker::GPUMemoryChecker() { nvmlReturn_t result = nvmlInit_v2(); if (NVML_SUCCESS != result) { throw std::runtime_error("Failed to initialize NVML: " + std::string(nvmlErrorString(result))); } result = nvmlDeviceGetCount_v2(&deviceCount_); if (NVML_SUCCESS != result) { nvmlShutdown(); throw std::runtime_error("Failed to get GPU count: " + std::string(nvmlErrorString(result))); } getCUDAVisibleDevice(); } GPUMemoryChecker::~GPUMemoryChecker() { nvmlShutdown(); } void GPUMemoryChecker::getCUDAVisibleDevice() { std::vector devices; const char* env_p = std::getenv("CUDA_VISIBLE_DEVICES"); if (!env_p) { for (int i = 0; i < deviceCount_; i++) { visible_device_.push_back(i); return; } } std::string env_str(env_p); std::istringstream stream(env_str); std::string device_id; while (std::getline(stream, device_id, ',')) { visible_device_.push_back(std::stoi(device_id)); visible_device_mem_usage_.push_back(-1); } std::cout << "\nVisible NVIDIA GPU devices" << env_str << std::endl; return; } void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) { try { for (int i = 0; i < visible_device_.size(); i++) { unsigned int device_id = visible_device_.at(i); nvmlDevice_t device; nvmlReturn_t result = nvmlDeviceGetHandleByIndex_v2(device_id, &device); if (NVML_SUCCESS != result) { std::cerr << "Failed to get handle for GPU " << device_id << ": " << nvmlErrorString(result) << std::endl; continue; } char name[NVML_DEVICE_NAME_BUFFER_SIZE]; result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); if (NVML_SUCCESS != result) { std::cerr << "Failed to get name for GPU " << device_id << ": " << nvmlErrorString(result) << std::endl; continue; } nvmlMemory_t memoryInfo; result = nvmlDeviceGetMemoryInfo(device, &memoryInfo); if (NVML_SUCCESS != result) { std::cerr << "Failed to get memory info for GPU " << device_id << ": " << nvmlErrorString(result) << std::endl; continue; } // Check GPU memory const char* env_c = std::getenv("MEMCHECKER_CHECK_MEMORY"); if (env_c) { assert(memoryInfo.used <= visible_device_mem_usage_.at(i) && "GPU Memory does not allow growth!"); } visible_device_mem_usage_[i] = memoryInfo.used; } // Check GPU memory const char* env_p = std::getenv("MEMCHECKER_PRINT_MEMORY"); if (env_p) { std::cout << "\nCall Line: " << call_line << "\t"; for (int i = 0; i < visible_device_.size(); i++) { unsigned int device_id = visible_device_.at(i); std::cout << "GPU " << device_id << ": " << " Used memory: " << visible_device_mem_usage_.at(device_id) / (1024 * 1024) << " MB\t"; } } } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; } } bool getEnvEnablePDL() { static std::once_flag flag; static bool enablePDL = false; std::call_once(flag, [&]() { int sm_version = GetSMVersion(); if (sm_version >= 90) { enablePDL = getBoolEnv("FD_ENABLE_PDL"); } }); return enablePDL; }