mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[LLM] First commit the llm deployment code
This commit is contained in:
108
custom_ops/gpu_ops/common/configManager.h
Normal file
108
custom_ops/gpu_ops/common/configManager.h
Normal file
@@ -0,0 +1,108 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <regex>
|
||||
#include <limits>
|
||||
|
||||
class ConfigManager {
|
||||
public:
|
||||
static ConfigManager& get_instance(const std::string& config_path = "fastdeploy_op_configs.json") {
|
||||
static ConfigManager instance(config_path);
|
||||
return instance;
|
||||
}
|
||||
|
||||
std::string get_best_config(const std::string& op_name, const size_t m, const size_t n, const size_t k) {
|
||||
initialize();
|
||||
std::string mnk_string = op_name + "-" +
|
||||
std::to_string(update_m(m)) + "x" + std::to_string(n) + "x" + std::to_string(k);
|
||||
if (configs_.contains(mnk_string)) {
|
||||
return configs_.at(mnk_string);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
int64_t update_m(const size_t m) {
|
||||
size_t new_m = m;
|
||||
if (m < 4) {
|
||||
return m;
|
||||
} else if (m < 16) {
|
||||
return (m + 3) / 4 * 4;
|
||||
} else if (m < 64) {
|
||||
return (m + 15) / 16 * 16;
|
||||
} else if (m < 256) {
|
||||
return (m + 31) / 32 * 32;
|
||||
} else if (m < 512) {
|
||||
return (m + 63) / 64 * 64;
|
||||
} else if (m < 1024) {
|
||||
return (m + 127) / 128 * 128;
|
||||
} else if (m < 8192) {
|
||||
return (m + 1023) / 1024 * 1024;
|
||||
} else if (m < 32768) {
|
||||
return (m + 4095) / 4096 * 4096;
|
||||
} else {
|
||||
return 32768;
|
||||
}
|
||||
}
|
||||
|
||||
void update(const std::string& op_name, const size_t m, const size_t n, const size_t k, const std::string& config) {
|
||||
initialize();
|
||||
std::string mnk_string = op_name + "-" +
|
||||
std::to_string(update_m(m)) + "x" + std::to_string(n) + "x" + std::to_string(k);
|
||||
configs_[mnk_string] = config;
|
||||
}
|
||||
|
||||
void print() const {
|
||||
std::cout << configs_.dump(4) << std::endl; // Pretty print with 4 spaces
|
||||
}
|
||||
|
||||
~ConfigManager() {
|
||||
std::ofstream file(config_path_);
|
||||
if (file.is_open()) {
|
||||
file << configs_.dump(4); // Pretty print with 4 spaces
|
||||
file.close();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void initialize() {
|
||||
if (initialized_) return;
|
||||
std::ifstream file(config_path_);
|
||||
if (file.is_open()) {
|
||||
try {
|
||||
file >> configs_;
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error reading configs from " << config_path_ << " : " << e.what() << std::endl;
|
||||
configs_ = nlohmann::json::object(); // Create an empty JSON object
|
||||
}
|
||||
file.close();
|
||||
} else {
|
||||
configs_ = nlohmann::json::object(); // Create an empty JSON object
|
||||
}
|
||||
initialized_ = true;
|
||||
}
|
||||
|
||||
ConfigManager(const std::string& config_path) : config_path_(config_path) {}
|
||||
ConfigManager(const ConfigManager&) = delete;
|
||||
ConfigManager& operator=(const ConfigManager&) = delete;
|
||||
|
||||
nlohmann::json configs_;
|
||||
std::string config_path_;
|
||||
bool initialized_{false};
|
||||
};
|
||||
33
custom_ops/gpu_ops/common/cudaUtils.h
Normal file
33
custom_ops/gpu_ops/common/cudaUtils.h
Normal file
@@ -0,0 +1,33 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include "paddle/phi/core/enforce.h"
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
inline int getSMVersion()
|
||||
{
|
||||
int device{-1};
|
||||
PADDLE_ENFORCE_GPU_SUCCESS(cudaGetDevice(&device));
|
||||
int sm_major = 0;
|
||||
int sm_minor = 0;
|
||||
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
|
||||
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
|
||||
return sm_major * 10 + sm_minor;
|
||||
}
|
||||
|
||||
}
|
||||
331
custom_ops/gpu_ops/common/quantization.h
Normal file
331
custom_ops/gpu_ops/common/quantization.h
Normal file
@@ -0,0 +1,331 @@
|
||||
/*
|
||||
* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace common
|
||||
{
|
||||
|
||||
// Bit-flag value type describing which quantization features are enabled.
// Each feature occupies one bit of `BaseType`; `+` / `-` set and clear bits.
class QuantMode
{
    // [WARNING] KEEP BELOW DEFINITION IN SYNC WITH tensorrt_llm/quantization/mode.py
public:
    using BaseType = std::uint32_t;

    // Wraps a raw bit pattern.
    explicit constexpr QuantMode(BaseType value) noexcept
        : mValue{value}
    {
    }

    // Default state has no bits set (see the initializer on mValue).
    QuantMode() noexcept = default;

    constexpr QuantMode(QuantMode const&) noexcept = default;

    constexpr QuantMode& operator=(QuantMode const& other) noexcept = default;

    // ---- Single-feature constants -------------------------------------

    static constexpr QuantMode none() noexcept
    {
        return QuantMode(BaseType(0));
    }

    static constexpr QuantMode int4Weights() noexcept
    {
        return flag(0);
    }

    static constexpr QuantMode int8Weights() noexcept
    {
        return flag(1);
    }

    static constexpr QuantMode activations() noexcept
    {
        return flag(2);
    }

    static constexpr QuantMode perChannelScaling() noexcept
    {
        return flag(3);
    }

    static constexpr QuantMode perTokenScaling() noexcept
    {
        return flag(4);
    }

    static constexpr QuantMode perGroupScaling() noexcept
    {
        return flag(5);
    }

    static constexpr QuantMode int8KvCache() noexcept
    {
        return flag(6);
    }

    static constexpr QuantMode fp8KvCache() noexcept
    {
        return flag(7);
    }

    static constexpr QuantMode fp8Qdq() noexcept
    {
        return flag(8);
    }

    // Composite: per-channel (bit 3) + per-token (bit 4) + bit 9.
    static constexpr QuantMode fp8RowWise() noexcept
    {
        return QuantMode(perChannelScaling().mValue | perTokenScaling().mValue | flag(9).mValue);
    }

    // ---- Queries ------------------------------------------------------

    // Raw bit pattern.
    constexpr BaseType value() const noexcept
    {
        return mValue;
    }

    // True when every bit of `mode` is also set in this value.
    constexpr bool isSet(QuantMode const& mode) const noexcept
    {
        BaseType const wanted = mode.mValue;
        return (mValue & wanted) == wanted;
    }

    constexpr bool hasInt4Weights() const noexcept
    {
        return isSet(int4Weights());
    }

    constexpr bool hasInt8Weights() const noexcept
    {
        return isSet(int8Weights());
    }

    constexpr bool hasActivations() const noexcept
    {
        return isSet(activations());
    }

    constexpr bool hasPerChannelScaling() const noexcept
    {
        return isSet(perChannelScaling());
    }

    constexpr bool hasPerTokenScaling() const noexcept
    {
        return isSet(perTokenScaling());
    }

    constexpr bool hasPerGroupScaling() const noexcept
    {
        return isSet(perGroupScaling());
    }

    // Static activation scaling is defined as the absence of per-token scaling.
    constexpr bool hasStaticActivationScaling() const noexcept
    {
        return !isSet(perTokenScaling());
    }

    constexpr bool hasInt8KvCache() const noexcept
    {
        return isSet(int8KvCache());
    }

    constexpr bool hasFp8KvCache() const noexcept
    {
        return isSet(fp8KvCache());
    }

    constexpr bool hasFp8Qdq() const noexcept
    {
        return isSet(fp8Qdq());
    }

    // Requires all three bits of fp8RowWise() to be set.
    constexpr bool hasFp8RowWise() const noexcept
    {
        return isSet(fp8RowWise());
    }

    constexpr bool hasKvCacheQuant() const noexcept
    {
        return isSet(int8KvCache()) || isSet(fp8KvCache());
    }

    // ---- Builders -----------------------------------------------------

    // Assembles a mode from individual switches. `useInt4Weights` is only
    // consulted when `quantizeWeights` is true, selecting int4 over int8.
    static constexpr QuantMode fromDescription(bool quantizeWeights = false, bool quantizeActivations = false,
        bool perToken = false, bool perChannel = false, bool perGroup = false, bool useInt4Weights = false,
        bool useInt8KvCache = false, bool useFp8KvCache = false, bool useFp8Qdq = false, bool useFp8RowWise = false)
    {
        BaseType bits = 0;

        if (quantizeWeights)
        {
            bits |= (useInt4Weights ? int4Weights() : int8Weights()).mValue;
        }
        if (quantizeActivations)
        {
            bits |= activations().mValue;
        }
        if (perChannel)
        {
            bits |= perChannelScaling().mValue;
        }
        if (perToken)
        {
            bits |= perTokenScaling().mValue;
        }
        if (perGroup)
        {
            bits |= perGroupScaling().mValue;
        }
        if (useInt8KvCache)
        {
            bits |= int8KvCache().mValue;
        }
        if (useFp8KvCache)
        {
            bits |= fp8KvCache().mValue;
        }
        if (useFp8Qdq)
        {
            bits |= fp8Qdq().mValue;
        }
        if (useFp8RowWise)
        {
            bits |= fp8RowWise().mValue;
        }

        return QuantMode(bits);
    }

    // int8 weights + quantized activations, with optional scaling flags.
    static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false)
    {
        return fromDescription(true, true, perToken, perChannel);
    }

    // Weight-only quantization (int8 by default, int4 on request).
    static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false)
    {
        return fromDescription(true, false, false, false, perGroup, useInt4Weights);
    }

    // Translates algorithm names into a mode. An absent or unrecognized
    // `quantAlgo` contributes no bits; `kvCacheQuantAlgo` ("INT8" or "FP8")
    // adds the matching KV-cache bit on top.
    static const QuantMode fromQuantAlgo(
        std::optional<std::string> quantAlgo = std::nullopt, std::optional<std::string> kvCacheQuantAlgo = std::nullopt)
    {
        QuantMode result{};

        if (quantAlgo.has_value())
        {
            std::string const& algo = *quantAlgo;
            if (algo == "W8A16")
            {
                result = useWeightOnly(false, false);
            }
            else if (algo == "W4A16")
            {
                result = useWeightOnly(true, false);
            }
            else if (algo == "W4A16_AWQ" || algo == "W4A8_AWQ" || algo == "W4A16_GPTQ")
            {
                // All grouped int4 variants share the same flag set.
                result = useWeightOnly(true, true);
            }
            else if (algo == "W8A8_SQ_PER_CHANNEL" || algo == "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN")
            {
                result = useSmoothQuant(false, true);
            }
            else if (algo == "W8A8_SQ_PER_TENSOR_PLUGIN")
            {
                result = useSmoothQuant(false, false);
            }
            else if (algo == "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN")
            {
                result = useSmoothQuant(true, true);
            }
            else if (algo == "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN")
            {
                result = useSmoothQuant(true, false);
            }
            else if (algo == "FP8")
            {
                result = fromDescription(false, false, false, false, false, false, false, false, true);
            }
            else if (algo == "FP8_ROWWISE")
            {
                result = fromDescription(false, false, true, true, false, false, false, false, false, true);
            }
        }

        if (kvCacheQuantAlgo.has_value())
        {
            std::string const& kvAlgo = *kvCacheQuantAlgo;
            if (kvAlgo == "INT8")
            {
                result.mValue |= int8KvCache().mValue;
            }
            else if (kvAlgo == "FP8")
            {
                result.mValue |= fp8KvCache().mValue;
            }
        }

        return result;
    }

    // ---- Set-like operators ------------------------------------------

    // Union of the two bit sets.
    constexpr QuantMode operator+(QuantMode const& other) const noexcept
    {
        return QuantMode(mValue | other.mValue);
    }

    constexpr QuantMode& operator+=(QuantMode const& other) noexcept
    {
        mValue |= other.mValue;
        return *this;
    }

    // Clears every bit that is set in `other`.
    constexpr QuantMode operator-(QuantMode const& other) const noexcept
    {
        return QuantMode(mValue & ~other.mValue);
    }

    constexpr QuantMode& operator-=(QuantMode const& other) noexcept
    {
        mValue &= ~other.mValue;
        return *this;
    }

    constexpr bool operator==(QuantMode const& other) const noexcept
    {
        return mValue == other.mValue;
    }

    constexpr bool operator!=(QuantMode const& other) const noexcept
    {
        return mValue != other.mValue;
    }

private:
    // Mode with only bit `index` set.
    static constexpr QuantMode flag(unsigned index) noexcept
    {
        return QuantMode(BaseType(1u) << index);
    }

    BaseType mValue{0};
};
|
||||
|
||||
} // namespace common
|
||||
Reference in New Issue
Block a user