mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-25 17:40:35 +08:00
814 lines
30 KiB
Plaintext
814 lines
30 KiB
Plaintext
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <cublasLt.h>
|
|
#include <cublas_v2.h>
|
|
#include <cuda_runtime_api.h>
|
|
#include <sys/time.h>
|
|
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <iomanip>
|
|
#include <iostream>
|
|
#include <limits>
|
|
#include <list>
|
|
#include <vector>
|
|
|
|
#include "helper.h"
|
|
|
|
template <typename T>
|
|
void handleError(T status, const char* file, int line) {
|
|
printf("Unknown error type at %s:%d\n", file, line);
|
|
exit(1);
|
|
}
|
|
|
|
// for cudaError_t
|
|
template <>
|
|
void handleError<cudaError_t>(cudaError_t status, const char* file, int line) {
|
|
if (status != cudaSuccess) {
|
|
printf(
|
|
"CUDA error at %s:%d - %s\n", file, line, cudaGetErrorString(status));
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
// for cublasStatus_t
|
|
template <>
|
|
void handleError<cublasStatus_t>(cublasStatus_t status,
|
|
const char* file,
|
|
int line) {
|
|
if (status != CUBLAS_STATUS_SUCCESS) {
|
|
printf(
|
|
"cuBLAS error at %s:%d - %d\n", file, line, static_cast<int>(status));
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
#define CUDA_CHECK(call) \
|
|
do { \
|
|
handleError((call), __FILE__, __LINE__); \
|
|
} while (0)
|
|
|
|
typedef struct {
|
|
cublasLtMatmulAlgo_t algo;
|
|
cublasStatus_t status;
|
|
float time;
|
|
size_t workspaceSize;
|
|
cublasMath_t mathMode;
|
|
cublasLtReductionScheme_t reductionScheme;
|
|
int customOption;
|
|
float wavesCount;
|
|
} customMatmulPerf_t;
|
|
|
|
typedef struct {
|
|
cublasLtMatmulAlgo_t algo;
|
|
int m;
|
|
int n;
|
|
int k;
|
|
int algoId;
|
|
int swizzle;
|
|
int customOption;
|
|
int tile;
|
|
int splitK_val;
|
|
int reductionScheme;
|
|
int stages;
|
|
size_t workspaceSize;
|
|
float time;
|
|
} algoSelect_t;
|
|
|
|
inline double diffTime(
|
|
const std::chrono::high_resolution_clock::time_point& start,
|
|
const std::chrono::high_resolution_clock::time_point& end) {
|
|
return std::chrono::duration<double, std::milli>(end - start).count();
|
|
}
|
|
|
|
const int splitKSequenceA[] = {1, 2, 3, 4, 5, 6, 8, 12, 16, 32};
|
|
|
|
static inline bool time_compare_perf(const customMatmulPerf_t& perf_a,
|
|
const customMatmulPerf_t& perf_b) {
|
|
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) &&
|
|
(perf_a.time < perf_b.time));
|
|
}
|
|
|
|
static inline bool time_compare_algo_para(const algoSelect_t& algo_para_a,
|
|
const algoSelect_t& algo_para_b) {
|
|
return (algo_para_a.time < algo_para_b.time);
|
|
}
|
|
|
|
// 获取当前 GPU 的剩余显存大小(以字节为单位)
|
|
size_t get_remaining_memory() {
|
|
size_t free, total;
|
|
CUDA_CHECK(cudaMemGetInfo(&free, &total));
|
|
return free;
|
|
}
|
|
|
|
template <typename InT, typename OutT, typename ScaleT = OutT>
|
|
static void TestMatmulRun(cublasLtHandle_t ltHandle,
|
|
cublasLtMatmulDesc_t matmulDesc,
|
|
cublasLtMatrixLayout_t A_desc,
|
|
cublasLtMatrixLayout_t B_desc,
|
|
cublasLtMatrixLayout_t C_desc,
|
|
const InT* A,
|
|
const InT* B,
|
|
OutT* C,
|
|
const cublasLtMatmulAlgo_t& algo,
|
|
customMatmulPerf_t& perfResults,
|
|
cudaEvent_t& startEvent,
|
|
cudaEvent_t& stopEvent) {
|
|
cudaStream_t stream = 0;
|
|
cublasLtMatmulHeuristicResult_t heurResult;
|
|
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
|
|
ltHandle, matmulDesc, A_desc, B_desc, C_desc, C_desc, &algo, &heurResult);
|
|
|
|
auto remainingMemorySize = 0.95 * get_remaining_memory();
|
|
if (algoStatus == CUBLAS_STATUS_SUCCESS &&
|
|
remainingMemorySize > heurResult.workspaceSize) {
|
|
ScaleT alpha = static_cast<ScaleT>(1), beta = static_cast<ScaleT>(0);
|
|
void* workSpace;
|
|
CUDA_CHECK(cudaMalloc(&workSpace, heurResult.workspaceSize));
|
|
CUDA_CHECK(cudaEventRecord(startEvent, stream));
|
|
int repeats = 100;
|
|
for (int loop = 0; loop < repeats; loop++) {
|
|
cublasStatus_t currStatus = cublasLtMatmul(ltHandle,
|
|
matmulDesc,
|
|
&alpha,
|
|
A,
|
|
A_desc,
|
|
B,
|
|
B_desc,
|
|
&beta,
|
|
C,
|
|
C_desc,
|
|
C,
|
|
C_desc,
|
|
&algo,
|
|
workSpace,
|
|
heurResult.workspaceSize,
|
|
stream);
|
|
if (currStatus != CUBLAS_STATUS_SUCCESS) {
|
|
perfResults.status = currStatus;
|
|
break;
|
|
}
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
}
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
CUDA_CHECK(cudaEventRecord(stopEvent, stream));
|
|
CUDA_CHECK(cudaEventSynchronize(stopEvent));
|
|
float time;
|
|
CUDA_CHECK(cudaEventElapsedTime(&time, startEvent, stopEvent));
|
|
|
|
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
|
|
perfResults.algo = algo;
|
|
perfResults.time = time / repeats;
|
|
perfResults.workspaceSize = heurResult.workspaceSize;
|
|
perfResults.wavesCount = heurResult.wavesCount;
|
|
}
|
|
CUDA_CHECK(cudaFree(workSpace));
|
|
} else {
|
|
std::cerr << "Not enough workspace! Required "
|
|
<< static_cast<double>(heurResult.workspaceSize) / 1024.0 /
|
|
1024.0 / 1024.0
|
|
<< " GiB" << ", But remaining "
|
|
<< static_cast<double>(remainingMemorySize) / 1024.0 / 1024.0 /
|
|
1024.0
|
|
<< " GiB" << std::endl;
|
|
perfResults.status = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
|
|
}
|
|
}
|
|
|
|
template <typename InT, typename OutT, typename ScaleT = OutT>
|
|
void FindAlgo(const cublasLtHandle_t& ltHandle,
|
|
int m,
|
|
int n,
|
|
int k,
|
|
const InT* A,
|
|
const InT* B,
|
|
OutT* C,
|
|
cublasLtMatmulDesc_t matmulDesc,
|
|
cublasLtMatrixLayout_t A_desc,
|
|
cublasLtMatrixLayout_t B_desc,
|
|
cublasLtMatrixLayout_t C_desc,
|
|
cublasComputeType_t computeType,
|
|
cudaDataType_t scaleType,
|
|
cudaDataType_t Atype,
|
|
cudaDataType_t Btype,
|
|
cudaDataType_t Ctype,
|
|
std::vector<algoSelect_t>& algos,
|
|
const std::string& path) {
|
|
// Get Ids
|
|
// https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoGetIds
|
|
// Input
|
|
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
|
|
int AlgoCount = 0;
|
|
int AlgoCombinations = 20000;
|
|
std::vector<customMatmulPerf_t> perfResultsTmp;
|
|
|
|
// Output
|
|
int algoIdA[100];
|
|
int nbAlgoIds;
|
|
CUDA_CHECK(cublasLtMatmulAlgoGetIds(ltHandle,
|
|
computeType,
|
|
scaleType,
|
|
Atype,
|
|
Btype,
|
|
Ctype,
|
|
Ctype,
|
|
100,
|
|
algoIdA,
|
|
&nbAlgoIds));
|
|
|
|
std::clog << std::endl << "get " << nbAlgoIds << " algoIds" << std::endl;
|
|
|
|
for (int idx = 0; idx < nbAlgoIds; idx++) {
|
|
cublasLtMatmulAlgo_t algo;
|
|
std::clog << "Process algo: " << algoIdA[idx] << " ";
|
|
|
|
/* Initialize algo structure with given Algp ID */
|
|
// https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoInit
|
|
CUDA_CHECK(cublasLtMatmulAlgoInit(ltHandle,
|
|
computeType,
|
|
scaleType,
|
|
Atype,
|
|
Btype,
|
|
Ctype,
|
|
Ctype,
|
|
algoIdA[idx],
|
|
&algo));
|
|
// Query the tiles enums supported by that algo which is used to alloc
|
|
// enough space to store it
|
|
// https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCapGetAttribute
|
|
cublasLtMatmulTile_t tileA[CUBLASLT_MATMUL_TILE_END];
|
|
size_t nbTiles, sizeWritten;
|
|
CUDA_CHECK(cublasLtMatmulAlgoCapGetAttribute(
|
|
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(tileA), &sizeWritten));
|
|
nbTiles = sizeWritten / sizeof(tileA[0]);
|
|
|
|
// Query the stages enums supported by that algo (cuda must >= 11.0)
|
|
CUDA_CHECK(cublasLtMatmulAlgoCapGetAttribute(
|
|
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten));
|
|
int nbStages = int(sizeWritten / sizeof(uint32_t));
|
|
std::vector<uint32_t> stagesA(nbStages == 0 ? 1 : nbStages);
|
|
if (nbStages == 0) {
|
|
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
|
|
nbStages = 1;
|
|
} else {
|
|
CUDA_CHECK(cublasLtMatmulAlgoCapGetAttribute(&algo,
|
|
CUBLASLT_ALGO_CAP_STAGES_IDS,
|
|
stagesA.data(),
|
|
sizeof(uint32_t) * nbStages,
|
|
&sizeWritten));
|
|
}
|
|
|
|
// Retrieve Other Algo Capabilities attributes
|
|
int32_t splitkSupport, customOptionMax;
|
|
uint32_t redMask, swizzlingMax;
|
|
// cublasLtMatmulInnerShape_t innerShape;
|
|
CUDA_CHECK(
|
|
cublasLtMatmulAlgoCapGetAttribute(&algo,
|
|
CUBLASLT_ALGO_CAP_SPLITK_SUPPORT,
|
|
&splitkSupport,
|
|
sizeof(splitkSupport),
|
|
&sizeWritten));
|
|
std::clog << "splitkSupport: " << splitkSupport << std::endl;
|
|
CUDA_CHECK(cublasLtMatmulAlgoCapGetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK,
|
|
&redMask,
|
|
sizeof(redMask),
|
|
&sizeWritten));
|
|
CUDA_CHECK(cublasLtMatmulAlgoCapGetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT,
|
|
&swizzlingMax,
|
|
sizeof(swizzlingMax),
|
|
&sizeWritten));
|
|
CUDA_CHECK(
|
|
cublasLtMatmulAlgoCapGetAttribute(&algo,
|
|
CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX,
|
|
&customOptionMax,
|
|
sizeof(customOptionMax),
|
|
&sizeWritten));
|
|
|
|
/* Loop over the different tiles */
|
|
for (int tileIdx = 0; tileIdx < nbTiles && AlgoCount < AlgoCombinations;
|
|
tileIdx++) {
|
|
/* Loop over different stages count */
|
|
for (int stagesIdx = 0;
|
|
stagesIdx < nbStages && AlgoCount < AlgoCombinations;
|
|
stagesIdx++) {
|
|
/* Loop over the different custom option if any */
|
|
for (int32_t customOption = 0;
|
|
customOption <= customOptionMax && AlgoCount < AlgoCombinations;
|
|
customOption++) {
|
|
/* Loop over the CTAs swizzling support */
|
|
for (uint32_t k = 0;
|
|
k <= swizzlingMax && AlgoCount < AlgoCombinations;
|
|
k++) {
|
|
int splitK_trial = 0;
|
|
if (splitkSupport) {
|
|
splitK_trial +=
|
|
sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
|
|
}
|
|
|
|
for (int l = 0;
|
|
(l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations);
|
|
l++) {
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_TILE_ID,
|
|
&tileA[tileIdx],
|
|
sizeof(tileA[tileIdx])));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_STAGES_ID,
|
|
&stagesA[stagesIdx],
|
|
sizeof(stagesA[stagesIdx])));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
|
|
&customOption,
|
|
sizeof(customOption)));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)));
|
|
int splitK_val = 1;
|
|
uint32_t redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
|
|
&splitK_val,
|
|
sizeof(splitK_val)));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
|
|
&redScheme,
|
|
sizeof(int)));
|
|
if (l > 0) { // Split-K case
|
|
splitK_val = splitKSequenceA[l - 1];
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
|
|
&splitKSequenceA[l - 1],
|
|
sizeof(splitKSequenceA[l - 1])));
|
|
for (redScheme = 1;
|
|
redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK &&
|
|
(AlgoCount < AlgoCombinations);
|
|
redScheme <<= 1) {
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
|
|
&redScheme,
|
|
sizeof(redScheme)));
|
|
|
|
cublasLtMatmulHeuristicResult_t heurResult;
|
|
cublasStatus_t algoStatus =
|
|
cublasLtMatmulAlgoCheck(ltHandle,
|
|
matmulDesc,
|
|
A_desc,
|
|
B_desc,
|
|
C_desc,
|
|
C_desc,
|
|
&algo,
|
|
&heurResult);
|
|
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
|
|
algoSelect_t algoSelect;
|
|
algoSelect.algo = algo;
|
|
algoSelect.m = m;
|
|
algoSelect.n = n;
|
|
algoSelect.k = k;
|
|
algoSelect.algoId = algoIdA[idx];
|
|
algoSelect.tile = tileA[tileIdx];
|
|
algoSelect.swizzle = k;
|
|
algoSelect.customOption = customOption;
|
|
algoSelect.splitK_val = splitK_val;
|
|
algoSelect.reductionScheme = redScheme;
|
|
algoSelect.stages = stagesA[stagesIdx];
|
|
algos.push_back(algoSelect);
|
|
AlgoCount++;
|
|
}
|
|
}
|
|
} else {
|
|
// Prepare algos
|
|
cublasLtMatmulHeuristicResult_t heurResult;
|
|
// https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCheck
|
|
cublasStatus_t algoStatus =
|
|
cublasLtMatmulAlgoCheck(ltHandle,
|
|
matmulDesc,
|
|
A_desc,
|
|
B_desc,
|
|
C_desc,
|
|
C_desc,
|
|
&algo,
|
|
&heurResult);
|
|
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
|
|
algoSelect_t algoSelect;
|
|
algoSelect.algo = algo;
|
|
algoSelect.m = m;
|
|
algoSelect.n = n;
|
|
algoSelect.k = k;
|
|
algoSelect.algoId = algoIdA[idx];
|
|
algoSelect.tile = tileA[tileIdx];
|
|
algoSelect.swizzle = k;
|
|
algoSelect.customOption = customOption;
|
|
algoSelect.splitK_val = splitK_val;
|
|
algoSelect.reductionScheme = redScheme;
|
|
algoSelect.stages = stagesA[stagesIdx];
|
|
algos.push_back(algoSelect);
|
|
AlgoCount++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
std::clog << "Got " << AlgoCount << " algos" << std::endl;
|
|
cudaEvent_t startEvent;
|
|
cudaEvent_t stopEvent;
|
|
std::vector<customMatmulPerf_t> perfResults(AlgoCount);
|
|
CUDA_CHECK(cudaEventCreate(&startEvent));
|
|
CUDA_CHECK(cudaEventCreate(&stopEvent));
|
|
for (int i = 0; i < AlgoCount; i++) {
|
|
TestMatmulRun<InT, OutT, ScaleT>(ltHandle,
|
|
matmulDesc,
|
|
A_desc,
|
|
B_desc,
|
|
C_desc,
|
|
A,
|
|
B,
|
|
C,
|
|
algos[i].algo,
|
|
perfResults[i],
|
|
startEvent,
|
|
stopEvent);
|
|
algos[i].workspaceSize = perfResults[i].workspaceSize;
|
|
algos[i].time = perfResults[i].time;
|
|
if (perfResults[i].status != CUBLAS_STATUS_SUCCESS) {
|
|
std::clog << "algo " << algos[i].algoId << " tile " << algos[i].tile
|
|
<< " stages " << algos[i].stages << " splitK_val "
|
|
<< algos[i].splitK_val << std::endl;
|
|
algos[i].time = std::numeric_limits<float>::max();
|
|
std::cerr << " TestMatmulRun with status " << perfResults[i].status
|
|
<< std::endl;
|
|
continue;
|
|
}
|
|
}
|
|
std::sort(algos.begin(), algos.end(), time_compare_algo_para);
|
|
int i = 0;
|
|
while (algos[i].time == 0) i++;
|
|
std::ofstream outfile;
|
|
outfile.open(path, std::ios::app);
|
|
outfile << m << "," << k << "," << n << "," << algos[i].algoId << ","
|
|
<< algos[i].swizzle << "," << algos[i].customOption << ","
|
|
<< algos[i].tile << "," << algos[i].splitK_val << ","
|
|
<< algos[i].reductionScheme << "," << algos[i].stages << ","
|
|
<< algos[i].workspaceSize << "," << algos[i].time << "\n";
|
|
outfile.close();
|
|
}
|
|
|
|
class DevContext {};
|
|
|
|
class CPUContext : public DevContext {};
|
|
|
|
class CUBLASLTContext : public DevContext {
|
|
public:
|
|
CUBLASLTContext() { CUDA_CHECK(cublasLtCreate(&handle)); }
|
|
|
|
cublasLtHandle_t handle;
|
|
};
|
|
|
|
template <typename InT, typename OutT, typename DevContext>
|
|
void GEMMInt8(const DevContext& dev_ctx,
|
|
const std::vector<InT>& A,
|
|
const std::vector<InT>& B,
|
|
std::vector<OutT>& C,
|
|
int m,
|
|
int k,
|
|
int n,
|
|
bool is_test,
|
|
bool is_read_from_file = false,
|
|
const std::string& path = "search.csv") {
|
|
std::cerr << "Base Class is not implemented" << std::endl;
|
|
}
|
|
|
|
template <>
|
|
void GEMMInt8<int8_t, int32_t, CPUContext>(const CPUContext& dev_ctx,
|
|
const std::vector<int8_t>& A,
|
|
const std::vector<int8_t>& B,
|
|
std::vector<int32_t>& C,
|
|
int m,
|
|
int k,
|
|
int n,
|
|
bool is_test,
|
|
bool is_read_from_file,
|
|
const std::string& path) {
|
|
std::cerr << "CPUContext Class is not implemented" << std::endl;
|
|
}
|
|
|
|
template <>
|
|
void GEMMInt8<int8_t, int32_t, CUBLASLTContext>(const CUBLASLTContext& dev_ctx,
|
|
const std::vector<int8_t>& AVec,
|
|
const std::vector<int8_t>& BVec,
|
|
std::vector<int32_t>& CVec,
|
|
int m,
|
|
int k,
|
|
int n,
|
|
bool is_test,
|
|
bool is_read_from_file,
|
|
const std::string& path) {
|
|
int8_t* A_dev;
|
|
int8_t* B_dev;
|
|
int32_t* C_dev;
|
|
char* workSpace;
|
|
|
|
CUDA_CHECK(cudaMalloc((void**)&A_dev, AVec.size() * sizeof(int8_t)));
|
|
CUDA_CHECK(cudaMalloc((void**)&B_dev, BVec.size() * sizeof(int8_t)));
|
|
CUDA_CHECK(cudaMalloc((void**)&C_dev, m * n * sizeof(int32_t)));
|
|
CUDA_CHECK(
|
|
cudaMemcpy(A_dev, AVec.data(), AVec.size(), cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(
|
|
cudaMemcpy(B_dev, BVec.data(), BVec.size(), cudaMemcpyHostToDevice));
|
|
|
|
// init data structure
|
|
cublasLtMatmulDesc_t matmul_desc;
|
|
cublasLtMatrixLayout_t A_desc;
|
|
cublasLtMatrixLayout_t B_desc;
|
|
cublasLtMatrixLayout_t C_desc;
|
|
int32_t alpha = 1;
|
|
int32_t beta = 0;
|
|
|
|
cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I;
|
|
CUDA_CHECK(
|
|
cublasLtMatmulDescCreate(&matmul_desc, cudaComputeType, CUDA_R_32I));
|
|
cublasOperation_t op_transpose = CUBLAS_OP_T;
|
|
CUDA_CHECK(cublasLtMatmulDescSetAttribute(matmul_desc,
|
|
CUBLASLT_MATMUL_DESC_TRANSA,
|
|
&op_transpose,
|
|
sizeof(op_transpose)));
|
|
CUDA_CHECK(cublasLtMatrixLayoutCreate(&B_desc, CUDA_R_8I, k, n, k));
|
|
CUDA_CHECK(cublasLtMatrixLayoutCreate(&A_desc, CUDA_R_8I, k, m, k));
|
|
CUDA_CHECK(cublasLtMatrixLayoutCreate(&C_desc, CUDA_R_32I, n, m, n));
|
|
|
|
cublasLtMatmulAlgo_t algo;
|
|
int algoId;
|
|
int swizzle;
|
|
int customOption;
|
|
int tile;
|
|
int splitK_val;
|
|
int reductionScheme;
|
|
int stages;
|
|
size_t work_space_size = 0;
|
|
float time_ref;
|
|
|
|
auto using_default_config = [&]() {
|
|
algoId = 21;
|
|
swizzle = 0;
|
|
customOption = 0;
|
|
tile = 15;
|
|
splitK_val = 0;
|
|
reductionScheme = 0;
|
|
stages = 23;
|
|
if (m >= 128) {
|
|
tile = 20;
|
|
stages = 17;
|
|
}
|
|
};
|
|
if (is_test) {
|
|
std::vector<algoSelect_t> algos;
|
|
// Select //
|
|
FindAlgo(dev_ctx.handle,
|
|
m,
|
|
n,
|
|
k,
|
|
B_dev,
|
|
A_dev,
|
|
C_dev,
|
|
matmul_desc,
|
|
B_desc,
|
|
A_desc,
|
|
C_desc,
|
|
CUBLAS_COMPUTE_32I,
|
|
CUDA_R_32I,
|
|
CUDA_R_8I,
|
|
CUDA_R_8I,
|
|
CUDA_R_32I,
|
|
algos,
|
|
path);
|
|
int i = 0;
|
|
while (algos[i].time == 0) i++;
|
|
algoId = algos[i].algoId;
|
|
swizzle = algos[i].swizzle;
|
|
customOption = algos[i].customOption;
|
|
tile = algos[i].tile;
|
|
splitK_val = algos[i].splitK_val;
|
|
reductionScheme = algos[i].reductionScheme;
|
|
stages = algos[i].stages;
|
|
work_space_size = algos[i].workspaceSize;
|
|
} else if (is_read_from_file) {
|
|
int m_tmp, k_tmp, n_tmp;
|
|
std::ifstream file(path);
|
|
if (!file.is_open()) {
|
|
std::cout << "file not open. Now we use default params" << std::endl;
|
|
using_default_config();
|
|
} else {
|
|
bool match = false;
|
|
int find_cnt = 0;
|
|
std::string line;
|
|
while (std::getline(file, line)) {
|
|
std::istringstream iss(line);
|
|
char comma;
|
|
if (iss >> m_tmp >> comma >> k_tmp >> comma >> n_tmp >> comma >>
|
|
algoId >> comma >> swizzle >> comma >> customOption >> comma >>
|
|
tile >> comma >> splitK_val >> comma >> reductionScheme >> comma >>
|
|
stages >> comma >> work_space_size >> comma >> time_ref) {
|
|
if (k_tmp == k && n_tmp == n && m <= m_tmp) {
|
|
match = true;
|
|
break;
|
|
}
|
|
find_cnt++;
|
|
}
|
|
}
|
|
if (find_cnt == 0) {
|
|
std::cout << "the file is empty. Now we use default params"
|
|
<< std::endl;
|
|
using_default_config();
|
|
}
|
|
}
|
|
} else {
|
|
std::cout << "Please use test mode to select\n, Now we use default params"
|
|
<< std::endl;
|
|
using_default_config();
|
|
}
|
|
|
|
CUDA_CHECK(cudaMalloc((void**)&workSpace, work_space_size));
|
|
|
|
CUDA_CHECK(cublasLtMatmulAlgoInit(dev_ctx.handle,
|
|
cudaComputeType,
|
|
CUDA_R_32I,
|
|
CUDA_R_8I,
|
|
CUDA_R_8I,
|
|
CUDA_R_32I,
|
|
CUDA_R_32I,
|
|
algoId,
|
|
&algo));
|
|
CUDA_CHECK(
|
|
cublasLtMatmulAlgoConfigSetAttribute(&algo,
|
|
CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION,
|
|
&(customOption),
|
|
sizeof(customOption)));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)));
|
|
CUDA_CHECK(
|
|
cublasLtMatmulAlgoConfigSetAttribute(&algo,
|
|
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
|
|
&(splitK_val),
|
|
sizeof(splitK_val)));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo,
|
|
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
|
|
&(reductionScheme),
|
|
sizeof(int)));
|
|
CUDA_CHECK(cublasLtMatmulAlgoConfigSetAttribute(
|
|
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)));
|
|
|
|
auto start = std::chrono::high_resolution_clock::now();
|
|
const int repeats = 10;
|
|
for (int loop = 0; loop < repeats; loop++) {
|
|
CUDA_CHECK(cublasLtMatmul(dev_ctx.handle,
|
|
matmul_desc,
|
|
&alpha,
|
|
B_dev,
|
|
B_desc,
|
|
A_dev,
|
|
A_desc,
|
|
&beta,
|
|
C_dev,
|
|
C_desc,
|
|
C_dev,
|
|
C_desc,
|
|
&algo,
|
|
// nullptr,
|
|
workSpace,
|
|
// 0,
|
|
work_space_size,
|
|
0));
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
}
|
|
CUDA_CHECK(cudaDeviceSynchronize());
|
|
auto end = std::chrono::high_resolution_clock::now();
|
|
double time = diffTime(start, end);
|
|
auto now = std::chrono::system_clock::now();
|
|
std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
|
|
std::tm now_tm = *std::localtime(&now_time_t);
|
|
|
|
std::cout << "GEMM with cublaslt imma1 int8 spend " << time / repeats
|
|
<< " ms in " << m << ", " << k << ", " << n
|
|
<< ", current time: " << std::put_time(&now_tm, "%H:%M:%S")
|
|
<< std::endl;
|
|
CUDA_CHECK(cudaFree(A_dev));
|
|
CUDA_CHECK(cudaFree(B_dev));
|
|
CUDA_CHECK(cudaFree(C_dev));
|
|
CUDA_CHECK(cudaFree(workSpace));
|
|
}
|
|
|
|
void TuneCublasltGemm(const paddle::Tensor& K,
|
|
const paddle::Tensor& N,
|
|
const int M_start,
|
|
const int M_end,
|
|
const std::string& dtype,
|
|
const bool is_test,
|
|
const bool is_read_from_file,
|
|
const std::string& path) {
|
|
assert(M_end >= M_start);
|
|
assert(M_start >= 1);
|
|
assert(K.dims().size() == 1 && N.dims().size() == 1);
|
|
assert(is_test != is_read_from_file);
|
|
|
|
auto K_cpu = K.copy_to(paddle::CPUPlace(), false);
|
|
auto N_cpu = N.copy_to(paddle::CPUPlace(), false);
|
|
int64_t* K_data = K_cpu.data<int64_t>();
|
|
int64_t* N_data = N_cpu.data<int64_t>();
|
|
|
|
int K_size = K.numel();
|
|
int N_size = N.numel();
|
|
assert(K_size == N_size);
|
|
|
|
std::vector<int> mm;
|
|
int m = M_start, step = 1;
|
|
while (m <= M_end) {
|
|
// update step
|
|
if (m >= 8192) {
|
|
step = 4096;
|
|
} else if (m >= 1024) {
|
|
step = 1024;
|
|
} else if (m >= 512) {
|
|
step = 128;
|
|
} else if (m >= 256) {
|
|
step = 64;
|
|
} else if (m >= 64) {
|
|
step = 32;
|
|
} else if (m >= 16) {
|
|
step = 16;
|
|
} else if (m >= 4) {
|
|
step = 4;
|
|
} else {
|
|
step = 1;
|
|
}
|
|
mm.push_back(m);
|
|
m += step;
|
|
}
|
|
|
|
for (int j = 0; j < mm.size(); j++) {
|
|
int m = mm[j];
|
|
for (int i = 0; i < K_size; ++i) {
|
|
int n = (int)N_data[i];
|
|
int k = (int)K_data[i];
|
|
auto A = std::vector<int8_t>(m * k);
|
|
auto B = std::vector<int8_t>(k * n);
|
|
auto C = std::vector<int32_t>(m * n);
|
|
|
|
if (dtype == "int8") {
|
|
CUBLASLTContext dev_ctx;
|
|
GEMMInt8(dev_ctx,
|
|
A,
|
|
B,
|
|
C,
|
|
m,
|
|
k,
|
|
n,
|
|
is_test, /*is_test*/
|
|
is_read_from_file, /*is_read_from_file*/
|
|
path);
|
|
} else {
|
|
// other dtype
|
|
throw std::runtime_error(dtype + "not currently supported");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
PD_BUILD_STATIC_OP(tune_cublaslt_gemm)
|
|
.Inputs({"K", "N"})
|
|
.Outputs({})
|
|
.Attrs({"M_start: int",
|
|
"M_end: int",
|
|
"dtype: std::string",
|
|
"is_test: bool",
|
|
"is_read_from_file: bool",
|
|
"path: std::string"})
|
|
.SetKernelFn(PD_KERNEL(TuneCublasltGemm));
|