// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle2onnx/mapper/quantize_helper.h"

namespace paddle2onnx {

void QuantizeModelProcessor::RemoveNodeByName(const std::string& name,
                                              const bool& update_io) {
  if (name.empty()) {
    return;
  }
  for (auto iter = nodes_->begin(); iter != nodes_->end(); iter++) {
    if ((*iter)->name() == name) {
      std::string input_name = (*iter)->input(0);
      std::string output_name = (*iter)->output(0);
      nodes_->erase(iter);
      if (update_io) {
        ReplaceInputOfAllNodes(output_name, input_name);
      }
      return;
    }
  }
}

void QuantizeModelProcessor::ReplaceInputOfAllNodes(
    const std::string& old_name, const std::string& new_name,
    const std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>&
        except_nodes) {
  auto iter = name2node_dict_.find(old_name);
  std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> need_rename_nodes;
  // After replacing every occurrence of old_name with new_name, also copy the
  // quantize_info of old_name to new_name.
  auto quantize_info_iter = helper_->quantize_info.find(old_name);
  if (quantize_info_iter != helper_->quantize_info.end()) {
    helper_->quantize_info[new_name] = helper_->quantize_info[old_name];
  }

  if (iter != name2node_dict_.end()) {
    need_rename_nodes = iter->second;
  }
  for (auto& node : need_rename_nodes) {
    auto iter = std::find(except_nodes.begin(), except_nodes.end(), node);
    if (iter != except_nodes.end()) {
      continue;
    }
    for (size_t i = 0; i < node->input_size(); ++i) {
      if (node->input(i) == old_name) {
        node->set_input(i, new_name);
      }
    }
  }
}

void QuantizeModelProcessor::UpdateInputNameToNodes() {
  name2node_dict_.clear();
  for (auto& node : *nodes_) {
    for (size_t i = 0; i < node->input_size(); ++i) {
      std::string node_input = node->input(i);
      if (name2node_dict_.find(node_input) != name2node_dict_.end()) {
        name2node_dict_[node_input].push_back(node);
      } else {
        name2node_dict_[node_input] = {node};
      }
    }
  }
}

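// Example call site (illustrative sketch only; the real exporter wiring lives
// outside this file, and the variable names below are hypothetical):
//
//   QuantizeModelProcessor processor;
//   std::string calibration_cache;
//   processor.ProcessQuantizeModel(&parameters, &inputs, &outputs, &nodes,
//                                  &onnx_helper, "onnxruntime", parser,
//                                  &calibration_cache);
//
// For the "tensorrt" backend, calibration_cache is filled by GenerateCache()
// and can be saved alongside the exported float ONNX model.
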
void QuantizeModelProcessor::ProcessQuantizeModel(
    std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* parameters,
    std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* inputs,
    std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* outputs,
    std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* nodes,
    OnnxHelper* helper, const std::string& deploy_backend,
    const PaddleParser& parser, std::string* calibration_cache) {
  // Determine whether the model contains quantization-related ops; if not,
  // exit directly.
  bool quantized_model = false;
  for (auto& node : *nodes) {
    if (node->op_type() == "QuantizeLinear" ||
        node->op_type() == "DequantizeLinear") {
      quantized_model = true;
      break;
    }
  }
  if (!quantized_model) {
    return;
  }
  parser_ = &parser;
  helper_ = helper;
  parameters_ = parameters;
  inputs_ = inputs;
  outputs_ = outputs;
  nodes_ = nodes;
  P2OLogger() << "[Info] Quantize model deploy backend is: " << deploy_backend
              << std::endl;
  // Determine the format of the exported ONNX quantized model according to
  // deploy_backend.
  if (deploy_backend == "others") {
    // If deploy_backend is "others", the quantized model is exported as a
    // float model plus a quantization table (max_range.txt).
    RemoveAllQuantizeOps();
    std::ofstream outfile;
    outfile.open("max_range.txt", std::ios::out);
    if (!outfile.is_open()) {
      P2OLogger() << "[WARNING] Quantize model processor failed to write range "
                     "information in the current location."
                  << std::endl;
      return;
    }
    for (auto iter = helper_->quantize_info.begin();
         iter != helper_->quantize_info.end(); iter++) {
      std::string log = iter->first;
      auto scale = iter->second.scale_;
      if (scale.size() == 1) {
        log = log + ": " + std::to_string(scale[0] * 127);
        outfile << log << std::endl;
      }
    }
    outfile.close();
  } else if (deploy_backend == "onnxruntime") {
    // When deploy_backend is ONNXRuntime, use the following steps to process:
    // 1. broadcast quantize info
    // 2. remove all quantize ops
    // 3. merge conv and add
    // 4. merge conv and bn
    // 5. add Q and DQ according to the ONNXRuntime quantize-op fusion pattern
    // 6. topologically sort the nodes
    QuantizeInfoBroadcast();
    RemoveAllQuantizeOps();
    MergeConvAdd();
    MergeConvBN();
    AddQDQForORT();
    SortNodes();
  } else if (deploy_backend == "tensorrt") {
    // When deploy_backend is TensorRT, use the following steps to process:
    // For Explicit Quantization:
    // 1. broadcast quantize info
    // 2. remove all quantize ops
    // 3. add Q and DQ before Conv and MatMul
    // 4. topologically sort the nodes

    // For Implicit Quantization:
    // 1. remove all quantize ops
    // 2. broadcast quantize info
    // 3. save the float ONNX model and calibration.cache
    QuantizeInfoBroadcast();
    RemoveAllQuantizeOps();
    // Add Q and DQ for Explicit Quantization
    // AddTrtQDQ();
    // SortNodes();

    // Generate calibration.cache for Implicit Quantization
    // (convert each float scale to hex)
    GenerateCache(calibration_cache);
  } else if (deploy_backend == "rknn") {
    // When deploy_backend is RKNN, use the following steps to process:
    // 1. broadcast quantize info
    // 2. remove all quantize ops
    // 3. merge conv and add
    // 4. merge conv and bn
    // 5. add Q and DQ
    // 6. topologically sort the nodes
    QuantizeInfoBroadcast();
    RemoveAllQuantizeOps();
    MergeConvAdd();
    MergeConvBN();
    AddQDQForRKNN();
    SortNodes();
  } else {
    Assert(false,
           "[QuantizeModelProcessor] Only support 'onnxruntime' / 'tensorrt' "
           "/ 'rknn' / 'others' as "
           "backend now, but the backend is: " +
               deploy_backend + ".");
  }
}

void QuantizeModelProcessor::AddQDQForRKNN() {
  UpdateInputNameToNodes();
  supported_quantize_type_ = {"Abs",
                              "Acos",
                              "Add",
                              "Asin",
                              "Atan",
                              "AveragePool",
                              "BatchNormalization",
                              "Ceil",
                              "Clip",
                              "Conv",
                              "ConvTranspose",
                              "Cos",
                              "Cosh",
                              "Concat",
                              "Elu",
                              "Erf",
                              "Exp",
                              "Floor",
                              "Gemm",
                              "HardSigmoid",
                              "InstanceNormalization",
                              "IsInf",
                              "IsNaN",
                              "Log",
                              "MaxPool",
                              "Mul",
                              "Neg",
                              "ReduceMean",
                              "Relu",
                              "Resize",
                              "Round",
                              "Sigmoid",
                              "Sin",
                              "Sinh",
                              "Split",
                              "Sqrt",
                              "Tan",
                              "Tanh"};
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    auto type_iter = std::find(supported_quantize_type_.begin(),
                               supported_quantize_type_.end(), node->op_type());
    if (!supported_quantize_type_.empty() &&
        type_iter == supported_quantize_type_.end()) {
      continue;
    }
    // Add the missing scale for constant inputs of Mul, MatMul and Add.
    if (node->op_type() == "Mul" || node->op_type() == "MatMul" ||
        node->op_type() == "Add") {
      for (size_t i = 0; i < node->input_size(); ++i) {
        std::string node_input = node->input(i);
        if (helper_->quantize_info.find(node_input) !=
            helper_->quantize_info.end()) {
          continue;
        }
        std::vector<float> weight;
        if (!GetTensorByName(node_input, &weight)) {
          continue;
        }
        std::vector<int64_t> weight_shape;
        GetTensorShape(node_input, &weight_shape);
        int64_t quantize_axis = 1;
        if (node->op_type() == "Add") {
          quantize_axis = 0;
        }
        std::vector<float> scale;
        std::vector<int64_t> zeros;

        if (node->op_type() == "Add" || weight_shape.size() == 1 ||
            weight_shape.empty()) {
          GetTensorWiseQuantizeInfo(weight, &scale, &zeros);
        } else {
          GetChannelWiseQuantizeInfo(weight, weight_shape, quantize_axis,
                                     &scale, &zeros);
        }
        std::string weight_scale_node, weight_zero_node;
        if (scale.size() == 1) {
          weight_scale_node = helper_->Constant(
              {}, ONNX_NAMESPACE::TensorProto::FLOAT, scale[0]);
          weight_zero_node = helper_->Constant(
              {}, ONNX_NAMESPACE::TensorProto::INT8, zeros[0]);
        } else {
          weight_scale_node =
              helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
          weight_zero_node =
              helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
        }
        QuantizeInfo weight_quantize_info(scale, zeros, weight_scale_node,
                                          weight_zero_node, quantize_axis);
        helper_->quantize_info[node_input] = weight_quantize_info;
      }
    }
    std::vector<std::string> tensor_names;
    for (size_t i = 0; i < node->input_size(); ++i) {
      std::string node_input = node->input(i);
      tensor_names.push_back(node_input);
    }
    for (size_t i = 0; i < node->output_size(); ++i) {
      std::string node_output = node->output(i);
      tensor_names.push_back(node_output);
    }
    if (!CanBeQuantize(tensor_names)) {
      continue;
    }
    for (auto& name : tensor_names) {
      AppendQuantizeTensor(name);
    }
  }

  // Refresh name2node_dict_ before inserting the Q/DQ nodes.
  UpdateInputNameToNodes();
  // Add Q and DQ pairs to the model.
  AddQDQInModel(tensors_to_be_quantize);
}

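// GenerateCache writes one "<tensor name>: <hex scale>" line per per-tensor
// scale, where the hex string is the IEEE-754 encoding of the float with the
// most significant byte first. For example (assuming a little-endian host,
// which the byte order below relies on), a scale of 1.0f is emitted as
// "3f800000".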
void QuantizeModelProcessor::GenerateCache(std::string* calibration_cache) {
  union {
    float f;
    unsigned char farray[4];
  } un;
  *calibration_cache += "TRT-8XXX-EntropyCalibration2 \n";
  for (auto iter = helper_->quantize_info.rbegin();
       iter != helper_->quantize_info.rend(); iter++) {
    std::string tensor_name = iter->first;
    QuantizeInfo quantize_info = iter->second;
    if (quantize_info.scale_.size() == 1) {
      float val = quantize_info.scale_[0];
      un.f = val;
      *calibration_cache += (tensor_name + ": ");
      std::stringstream enc;
      for (int64_t i = 3; i >= 0; i--) {
        enc << std::hex << std::setw(2) << std::setfill('0')
            << (int)(un.farray[i]);
      }
      *calibration_cache = *calibration_cache + enc.str() + "\n";
    }
  }
}

// In TensorRT, the quantizable ops are: Conv, ConvTranspose, linear (MatMul),
// MaxPool, AvgPool, AdaptiveAvgPool and rnn (not supported yet). See:
// https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules
void QuantizeModelProcessor::AddTrtQDQ() {
  UpdateInputNameToNodes();
  // Save the tensor names that need quantize ops to be added.
  std::vector<std::string> quantize_tensors;
  std::vector<std::string> pool_types = {"MaxPool", "AvgPool",
                                         "AdaptiveAvgPool"};
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    quantize_tensors.clear();
    auto node = *iter;
    if (node->op_type() == "Conv" || node->op_type() == "ConvTranspose") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      quantize_tensors = tensor_names;
    }
    if (node->op_type() == "MatMul") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1)};
      for (auto& name : tensor_names) {
        if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) {
          continue;
        }
        std::vector<float> matmul_weight;
        if (!GetTensorByName(name, &matmul_weight)) {
          continue;
        }
        std::vector<int64_t> matmul_weight_shape;
        if (!GetTensorShape(name, &matmul_weight_shape)) {
          continue;
        }
        int64_t quantize_axis = 1;
        std::vector<float> scale;
        std::vector<int64_t> zeros;
        GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
                                   quantize_axis, &scale, &zeros);
        auto scale_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
        auto zero_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
        QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
                                                 zero_node, quantize_axis);
        helper_->quantize_info[name] = matmul_weight_quantize_info;
      }
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      quantize_tensors = tensor_names;
    }
    auto type_iter =
        std::find(pool_types.begin(), pool_types.end(), node->op_type());
    if (type_iter != pool_types.end()) {
      std::vector<std::string> tensor_names = {node->input(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      quantize_tensors = tensor_names;
    }

    std::string negative_scale_tensor = "";
    for (std::string& name : quantize_tensors) {
      Assert(
          helper_->quantize_info.find(name) != helper_->quantize_info.end(),
          "[QuantizeModelProcessor] Can not find quantize info for tensor: " +
              name);
      QuantizeInfo quantize_info = helper_->quantize_info[name];
      std::vector<float> scales = quantize_info.scale_;
      for (auto& i : scales) {
        if (i <= 1e-10) {
          negative_scale_tensor = negative_scale_tensor + " " + name;
        }
      }
    }
    if (negative_scale_tensor.size() > 0) {
      P2OLogger()
          << "[Warning] The scale of tensors: [ " + negative_scale_tensor +
                 " ] contains a negative scale, so this op will not be "
                 "quantized."
          << std::endl;
      continue;
    }
    // Each quantized op gets its own Q/DQ pair.
    for (std::string& name : quantize_tensors) {
      if (IsGraphOutput(name)) {
        continue;
      }
      QuantizeInfo quantize_info = helper_->quantize_info[name];
      std::string scale_node = quantize_info.scale_node_;
      std::string zeros_node = quantize_info.zeros_node_;
      int64_t quantize_axis = quantize_info.quantize_axis_;
      auto q_node =
          helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(q_node, "axis", quantize_axis);
      }
      auto dq_node = helper_->MakeNode(
          "DequantizeLinear", {q_node->output(0), scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(dq_node, "axis", quantize_axis);
      }
      for (size_t i = 0; i < node->input_size(); ++i) {
        if (node->input(i) == name) {
          node->set_input(i, dq_node->output(0));
        }
      }
    }
  }
}

// According to:
// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
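// Note: the Q/DQ placements below are intended to line up with the QDQ
// node-unit patterns matched by the ONNXRuntime transformer referenced above
// (an interpretation of that reference, not something verified in this file).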
void QuantizeModelProcessor::AddQDQForORT() {
  UpdateInputNameToNodes();
  supported_quantize_type_ = {"Conv", "MatMul",    "Mul",  "Sigmoid",
                              "Add",  "LeakyRelu", "Relu"};
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    auto type_iter = std::find(supported_quantize_type_.begin(),
                               supported_quantize_type_.end(), node->op_type());
    if (!supported_quantize_type_.empty() &&
        type_iter == supported_quantize_type_.end()) {
      continue;
    }
    // Only the op types above are handled here; every tensor that should get
    // a Q/DQ pair is collected into tensors_to_be_quantize.
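    // Note: LeakyRelu with alpha == 0.0 computes exactly the same function as
    // Relu, so the rewrite below changes only the op type that the later QDQ
    // handling sees, not the numerics.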
    if (node->op_type() == "Relu") {
      std::vector<std::string> tensor_names = {node->input(0), node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      node->set_op_type("LeakyRelu");
      AddAttribute(node, "alpha", static_cast<float>(0.0));
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "LeakyRelu") {
      std::vector<std::string> tensor_names = {node->input(0), node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Add") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Sigmoid") {
      std::vector<std::string> tensor_names = {node->input(0), node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Conv") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      if (node->input_size() == 3) {
        tensor_names.push_back(node->input(2));
      }
      if (!CanBeQuantize(tensor_names, {2})) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "MatMul") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      for (auto& name : tensor_names) {
        if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) {
          continue;
        }
        std::vector<float> matmul_weight;
        if (!GetTensorByName(name, &matmul_weight)) {
          continue;
        }
        std::vector<int64_t> matmul_weight_shape;
        if (!GetTensorShape(name, &matmul_weight_shape)) {
          continue;
        }
        int64_t quantize_axis = 1;
        std::vector<float> scale;
        std::vector<int64_t> zeros;
        GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
                                   quantize_axis, &scale, &zeros);
        auto scale_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
        auto zero_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
        QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
                                                 zero_node, quantize_axis);
        helper_->quantize_info[name] = matmul_weight_quantize_info;
      }
      if (!CanBeQuantize(tensor_names)) {
        tensor_names.pop_back();
        if (!CanBeQuantize(tensor_names)) {
          continue;
        }
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Mul") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
  }
  // Update name2node_dict_ to account for the Relu -> LeakyRelu rewrite.
  UpdateInputNameToNodes();
  // Add Q and DQ pairs to the model.
  AddQDQInModel(tensors_to_be_quantize);
}

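// AddQDQInModel inserts the actual QuantizeLinear/DequantizeLinear nodes for
// every tensor collected by the passes above. Bias tensors registered via
// AppendQuantizeTensor(name, true) get only a DequantizeLinear (their values
// are rewritten to int32 first); every other tensor gets a full Q/DQ pair.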
void QuantizeModelProcessor::AddQDQInModel(
    const std::vector<std::string>& tensors_to_be_quantize) {
  // Add Q and DQ nodes according to tensors_to_be_quantize.
  for (auto& name : tensors_to_be_quantize) {
    if (IsGraphOutput(name)) {
      continue;
    }
    Assert(helper_->quantize_info.find(name) != helper_->quantize_info.end(),
           "[QuantizeModelProcessor] Can not find quantize info for tensor: " +
               name);
    QuantizeInfo quantize_info = helper_->quantize_info[name];
    std::string scale_node = quantize_info.scale_node_;
    std::string zeros_node = quantize_info.zeros_node_;
    int64_t quantize_axis = quantize_info.quantize_axis_;
    auto iter = std::find(only_dequantize_tensors.begin(),
                          only_dequantize_tensors.end(), name);
    if (iter != only_dequantize_tensors.end()) {
      // Only a DequantizeLinear node is added (bias tensors).
      std::vector<float> scale = quantize_info.scale_;
      std::vector<float> bias;
      Assert(GetTensorByName(name, &bias),
             "[QuantizeModelProcessor] Can not find bias value: " + name);
      std::vector<int32_t> new_bias(scale.size(), 0);
      for (int64_t i = 0; i < bias.size(); i++) {
        float scale_val = scale.size() == 1 ? scale[0] : scale[i];
        new_bias[i] = rint(bias[i] / scale_val);
      }

      Weight updated_bias;
      std::vector<int64_t> bias_shape = {static_cast<int64_t>(new_bias.size())};
      updated_bias.set(P2ODataType::INT32, bias_shape, new_bias);
      helper_->updated_params[name] = updated_bias;
      auto dq_node =
          helper_->MakeNode("DequantizeLinear", {name, scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(dq_node, "axis", quantize_axis);
      }
      ReplaceInputOfAllNodes(name, dq_node->output(0));
    } else {
      // Handle the case where the tensor feeds several consumers, e.g. a conv
      // output consumed by two convs and a scale op. A single Q/DQ pair is
      // inserted after the tensor; the two convs are rewired to the DQ
      // output, while consumers whose op type is not quantizable (the scale
      // op) keep the original float tensor.
      std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> except_nodes;
      auto next_nodes = name2node_dict_[name];
      if (next_nodes.size() > 1) {
        for (auto& node : next_nodes) {
          auto iter =
              std::find(supported_quantize_type_.begin(),
                        supported_quantize_type_.end(), node->op_type());
          if (iter == supported_quantize_type_.end()) {
            except_nodes.push_back(node);
          }
        }
      }
      // When none of this tensor's consumers can be rewired, the quantize op
      // will be merged, so do not except any of them.
      if (next_nodes.size() == except_nodes.size()) {
        except_nodes.clear();
      }
      auto q_node =
          helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(q_node, "axis", quantize_axis);
      }
      auto dq_node = helper_->MakeNode(
          "DequantizeLinear", {q_node->output(0), scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(dq_node, "axis", quantize_axis);
      }
      ReplaceInputOfAllNodes(name, dq_node->output(0), except_nodes);
    }
  }
}

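// MergeConvBN folds a following BatchNormalization into the Conv itself. With
// BN parameters gamma (scale), beta (bias), mean, var and epsilon, the code
// below computes, per output channel i:
//   alpha[i]    = gamma[i] / sqrt(var[i] + epsilon)
//   new_weight  = conv_weight * alpha[i]   (all weights of output channel i)
//   new_bias[i] = conv_bias[i] * alpha[i] + beta[i] - mean[i] * alpha[i]
// and then refreshes the weight and bias quantize info of the merged Conv.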
void QuantizeModelProcessor::MergeConvBN() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto conv_node = *iter;
    if (conv_node->op_type() != "Conv") {
      continue;
    }

    bool act_has_quantize_info =
        helper_->quantize_info.find(conv_node->input(0)) !=
        helper_->quantize_info.end();
    if (!act_has_quantize_info) {
      continue;
    }
    auto next_nodes = name2node_dict_[conv_node->output(0)];

    if (next_nodes.size() > 1 || IsGraphOutput(conv_node->output(0))) {
      continue;
    }

    auto bn_node = next_nodes[0];
    if (bn_node->op_type() != "BatchNormalization" ||
        IsGraphOutput(bn_node->output(0))) {
      continue;
    }

    std::vector<float> conv_weight;
    Assert(GetTensorByName(conv_node->input(1), &conv_weight),
           "Can not get " + conv_node->input(1) + " from Conv.");

    std::vector<float> bn_scale;
    Assert(GetTensorByName(bn_node->input(1), &bn_scale),
           "Can not get " + bn_node->input(1) + " from BN.");

    std::vector<float> bn_bias;
    Assert(GetTensorByName(bn_node->input(2), &bn_bias),
           "Can not get " + bn_node->input(2) + " from BN.");

    std::vector<float> bn_mean;
    Assert(GetTensorByName(bn_node->input(3), &bn_mean),
           "Can not get " + bn_node->input(3) + " from BN.");

    std::vector<float> bn_var;
    Assert(GetTensorByName(bn_node->input(4), &bn_var),
           "Can not get " + bn_node->input(4) + " from BN.");

    float epsilon = 1;
    for (auto i = 0; i < bn_node->attribute_size(); i++) {
      auto attr = bn_node->attribute(i);
      if (attr.name() == "epsilon") {
        epsilon = attr.f();
      }
    }

    std::vector<float> conv_bias(bn_bias.size(), 0);
    std::string conv_bias_node = conv_node->input(1) + ".merged.bias";
    if (conv_node->input_size() == 3) {
      conv_bias_node = conv_node->input(2);
      conv_bias.clear();
      Assert(GetTensorByName(conv_bias_node, &conv_bias),
             "Can not get " + conv_node->input(2) + " in Conv.");
    }

    // Merge Conv and BN.
    std::vector<float> alpha(bn_scale.size());
    for (int64_t i = 0; i < bn_scale.size(); i++) {
      alpha[i] = bn_scale[i] / sqrt(bn_var[i] + epsilon);
    }

    std::vector<float> new_bias(bn_scale.size());
    for (int64_t i = 0; i < bn_scale.size(); i++) {
      new_bias[i] =
          conv_bias[i] * alpha[i] + (bn_bias[i] - bn_mean[i] * alpha[i]);
    }
    std::vector<float> new_weight(conv_weight.size());
    int64_t offset = conv_weight.size() / bn_bias.size();
    for (int64_t i = 0; i < bn_scale.size(); i++) {
      int64_t outter_offset = i * offset;
      for (int64_t j = 0; j < offset; j++) {
        int64_t index = outter_offset + j;
        new_weight[index] = conv_weight[index] * alpha[i];
      }
    }
    // Update the weight.
    std::vector<int64_t> weight_shape;
    Assert(GetTensorShape(conv_node->input(1), &weight_shape),
           "Can not get the shape of " + conv_node->input(1) + " in Conv.");
    Weight updated_conv_weight;
    updated_conv_weight.set(P2ODataType::FP32, weight_shape, new_weight);
    helper_->updated_params[conv_node->input(1)] = updated_conv_weight;
    // Update the bias.
    Weight updated_bias_weight;
    std::vector<int64_t> bias_shape = {static_cast<int64_t>(new_bias.size())};
    updated_bias_weight.set(P2ODataType::FP32, bias_shape, new_bias);
    helper_->updated_params[conv_bias_node] = updated_bias_weight;
    AppendQuantizeTensor(conv_bias_node, true);
    // Update the weight scale.
    auto quantize_info = helper_->quantize_info[conv_node->input(1)];
    std::string scale_node = quantize_info.scale_node_;
    std::string zero_node = quantize_info.zeros_node_;
    int64_t quantize_axis = quantize_info.quantize_axis_;
    RemoveNodeByName(scale_node);
    RemoveNodeByName(zero_node);
    std::vector<float> scale = quantize_info.scale_;
    std::vector<float> new_scale;
    std::vector<int64_t> new_zeros;
    if (scale.size() == 1) {
      GetTensorWiseQuantizeInfo(new_weight, &new_scale, &new_zeros);
    } else {
      GetChannelWiseQuantizeInfo(new_weight, weight_shape, quantize_axis,
                                 &new_scale, &new_zeros);
    }
    auto weight_scale_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, new_scale);
    auto weight_zero_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, new_zeros);
    QuantizeInfo updated_weight_quantize_info(new_scale, new_zeros,
                                              weight_scale_node,
                                              weight_zero_node, quantize_axis);
    helper_->quantize_info[conv_node->input(1)] = updated_weight_quantize_info;
    // Add the bias scale and update the bias.
    auto act_quantize_info = helper_->quantize_info[conv_node->input(0)];
    std::vector<float> act_scale = act_quantize_info.scale_;
    std::vector<float> bias_scale;
    for (int64_t i = 0; i < new_scale.size(); i++) {
      bias_scale.push_back(act_scale[0] * new_scale[i]);
    }
    std::vector<int64_t> bias_zeros(bias_scale.size(), 0);
    auto bias_scale_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, bias_scale);
    auto bias_zero_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, bias_zeros);
    QuantizeInfo bias_quantize_info(bias_scale, bias_zeros, bias_scale_node,
                                    bias_zero_node, 0);
    helper_->quantize_info[conv_bias_node] = bias_quantize_info;
    if (conv_node->input_size() == 2) {
      conv_node->add_input(conv_bias_node);
    }
    // Remove the BN op.
    RemoveNodeByName(bn_node->name());
  }
}

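// MergeConvAdd looks for the pattern
//   Conv -> Add(other input = Reshape(constant bias, shape [1, C, 1, 1]))
// and folds the Add into the Conv by appending the constant as the Conv bias.
// The Reshape and Add nodes are removed, and the bias gets a per-channel
// dequantize scale equal to act_scale * weight_scale.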
void QuantizeModelProcessor::MergeConvAdd() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    if (node->op_type() != "Conv") {
      continue;
    }
    // Skip if the activation input of the Conv has no quantize info.
    bool act_has_quantize_info = helper_->quantize_info.find(node->input(0)) !=
                                 helper_->quantize_info.end();
    if (!act_has_quantize_info) {
      continue;
    }

    // Skip if the weight of the Conv has no quantize info.
    bool weight_has_quantize_info =
        helper_->quantize_info.find(node->input(1)) !=
        helper_->quantize_info.end();
    if (!weight_has_quantize_info) {
      continue;
    }
    auto next_nodes = name2node_dict_[node->output(0)];

    if (next_nodes.size() > 1 || IsGraphOutput(node->output(0))) {
      continue;
    }

    auto next_node = next_nodes[0];
    if (next_node->op_type() != "Add" || IsGraphOutput(next_node->output(0))) {
      continue;
    }
    std::string reshape_node = node->output(0) == next_node->input(0)
                                   ? next_node->input(1)
                                   : next_node->input(0);
    std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> before_nodes;
    for (auto& node : *nodes_) {
      for (size_t i = 0; i < node->output_size(); ++i) {
        std::string node_output = node->output(i);
        if (node_output == reshape_node) {
          before_nodes.push_back(node);
          break;
        }
      }
    }

    if (before_nodes.size() != 1 || before_nodes[0]->op_type() != "Reshape") {
      continue;
    }

    std::string bias_node = before_nodes[0]->input(0);
    // Skip if the bias is not a constant.
    std::vector<float> bias_val;
    if (!GetTensorByName(bias_node, &bias_val)) {
      continue;
    }

    // Skip if the shape tensor of the Reshape op is not a constant.
    std::vector<int64_t> shape_val;
    if (!GetTensorByName(before_nodes[0]->input(1), &shape_val)) {
      continue;
    }
    // Skip if shape_val != [1, bias_val.size(), 1, 1].
    std::vector<int64_t> target = {1, static_cast<int64_t>(bias_val.size()), 1,
                                   1};
    if (target != shape_val) {
      continue;
    }
    // Remove the Reshape op.
    RemoveNodeByName(before_nodes[0]->name());
    // Add a scale for the bias.
    std::vector<float> weight_scale =
        helper_->quantize_info[node->input(1)].scale_;
    std::vector<float> act_scale =
        helper_->quantize_info[node->input(0)].scale_;
    std::vector<float> bias_scale;
    for (int64_t i = 0; i < weight_scale.size(); i++) {
      bias_scale.push_back(weight_scale[i] * act_scale[0]);
    }
    std::vector<int64_t> onnx_zeros(bias_scale.size(), 0);
    auto scale_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, bias_scale);
    auto zero_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, onnx_zeros);

    QuantizeInfo quantize_info(bias_scale, onnx_zeros, scale_node, zero_node,
                               0);

    helper_->quantize_info[bias_node] = quantize_info;
    AppendQuantizeTensor(bias_node, true);
    node->add_input(bias_node);
    RemoveNodeByName(next_node->name());
  }
}

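// Example of the scheme below: for a chain A -> B -> C where only C produces
// a graph output, C has no consumers and is pushed first, then B (once its
// only consumer C has been processed), then A, giving new_nodes = [C, B, A];
// the final reverse yields the topological order [A, B, C].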
void QuantizeModelProcessor::SortNodes() {
  // Topologically sort the nodes:
  // 1. Build i2o_mapper and constant_nodes. i2o_mapper maps each node to all
  //    of its output (consumer) nodes; constant_nodes collects all Constant
  //    nodes.
  // 2. Nodes without consumers are saved to new_nodes first; then repeatedly
  //    erase processed nodes from the i2o_mapper entries and append every
  //    node whose consumer list becomes empty to new_nodes in turn.
  // 3. Append the constant nodes to new_nodes.
  // 4. Reverse new_nodes and assign it back to nodes_.
  std::map<std::string, std::vector<std::string>> i2o_mapper;
  std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> constant_nodes;
  std::map<std::string, std::shared_ptr<ONNX_NAMESPACE::NodeProto>>
      name2node_mapper;
  for (int64_t i = 0; i < nodes_->size(); i++) {
    auto node = (*nodes_)[i];
    if (node->op_type() == "Constant") {
      constant_nodes.push_back(node);
      continue;
    }
    name2node_mapper[node->name()] = node;
    for (int64_t in_index = 0; in_index < node->input_size(); in_index++) {
      std::string input = node->input(in_index);
      for (int64_t j = 0; j < nodes_->size(); j++) {
        if (i == j) {
          continue;
        }
        auto input_node = (*nodes_)[j];
        if (input_node->op_type() == "Constant") {
          continue;
        }
        for (int64_t out_index = 0; out_index < input_node->output_size();
             out_index++) {
          if (input == input_node->output(out_index)) {
            if (i2o_mapper.find(input_node->name()) == i2o_mapper.end()) {
              i2o_mapper[input_node->name()] = {node->name()};
            } else {
              auto iter =
                  std::find(i2o_mapper[input_node->name()].begin(),
                            i2o_mapper[input_node->name()].end(), node->name());
              if (iter == i2o_mapper[input_node->name()].end()) {
                i2o_mapper[input_node->name()].push_back(node->name());
              }
            }
          }
        }
      }
    }
  }
  std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> new_nodes;

  for (int64_t i = 0; i < nodes_->size(); i++) {
    auto node_name = (*nodes_)[i]->name();
    auto node = (*nodes_)[i];
    if (node->op_type() == "Constant") {
      continue;
    }
    if (i2o_mapper.find(node_name) == i2o_mapper.end()) {
      new_nodes.push_back(node);
    }
  }
  int64_t index = 0;
  while (index < new_nodes.size()) {
    auto current_node = new_nodes[index];
    std::string current_node_name = current_node->name();
    for (auto iter = i2o_mapper.begin(); iter != i2o_mapper.end(); iter++) {
      std::string input_node_name = iter->first;
      std::vector<std::string>* output_nodes_name = &iter->second;
      if (output_nodes_name->empty()) {
        continue;
      }
      auto in_inter = std::find(output_nodes_name->begin(),
                                output_nodes_name->end(), current_node_name);
      if (in_inter != output_nodes_name->end()) {
        output_nodes_name->erase(in_inter);
      }
      if (output_nodes_name->empty()) {
        new_nodes.push_back(name2node_mapper[input_node_name]);
      }
    }
    index++;
  }

  for (auto& node : constant_nodes) {
    new_nodes.push_back(node);
  }
  std::reverse(new_nodes.begin(), new_nodes.end());
  Assert(nodes_->size() == new_nodes.size(),
         "The number of nodes after topological sorting is not equal to the "
         "number before sorting");
  *nodes_ = new_nodes;
}

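// RemoveAllQuantizeOps removes every QuantizeLinear whose (first) consumer is
// a DequantizeLinear, drops both nodes, and rewires the consumers of the
// DequantizeLinear output back to the original float input, leaving a plain
// float graph.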
void QuantizeModelProcessor::RemoveAllQuantizeOps() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    if (node->op_type() != "QuantizeLinear") {
      continue;
    }
    auto next_node_names = name2node_dict_[node->output(0)];

    if (next_node_names.empty() || !next_node_names[0]->has_op_type() ||
        next_node_names[0]->op_type() != "DequantizeLinear") {
      continue;
    }
    std::string input_name = node->input(0);
    RemoveNodeByName(node->name());
    std::string output_name = next_node_names[0]->output(0);
    RemoveNodeByName(next_node_names[0]->name());
    ReplaceInputOfAllNodes(output_name, input_name);
  }
}

// Broadcast quantize info between the input and output of the ops that will
// not change quantize info.
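// For example, if only the input of an Identity node has a recorded scale,
// the same QuantizeInfo is copied to its output (and vice versa); the
// Identity node itself is then removed unless its output reaches a graph
// output, possibly through further Identity nodes.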
void QuantizeModelProcessor::QuantizeInfoBroadcast() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    if (node->op_type() != "Identity") {
      continue;
    }
    std::string input_name = node->input(0);
    std::string output_name = node->output(0);
    auto input_quantize_info_iter = helper_->quantize_info.find(input_name);
    auto output_quantize_info_iter = helper_->quantize_info.find(output_name);
    // Neither the input nor the output of the Identity has quantize info.
    if (input_quantize_info_iter == helper_->quantize_info.end() &&
        output_quantize_info_iter == helper_->quantize_info.end()) {
      continue;
    }
    // Both the input and the output of the Identity already have quantize
    // info.
    if (input_quantize_info_iter != helper_->quantize_info.end() &&
        output_quantize_info_iter != helper_->quantize_info.end()) {
      continue;
    }
    if (input_quantize_info_iter != helper_->quantize_info.end()) {
      helper_->quantize_info[output_name] = helper_->quantize_info[input_name];
    } else if (output_quantize_info_iter != helper_->quantize_info.end()) {
      helper_->quantize_info[input_name] = helper_->quantize_info[output_name];
    }
    if (ConnectToOutput(output_name)) {
      continue;
    }
    RemoveNodeByName(node->name());
    iter--;
  }
}

bool QuantizeModelProcessor::IsGraphOutput(const std::string& name) {
  for (auto& item : *outputs_) {
    auto out_node = (*item.get());
    if (name == out_node.name()) {
      return true;
    }
  }
  return false;
}

// Try to get the shape of a tensor by its name.
bool QuantizeModelProcessor::GetTensorShape(const std::string& name,
                                            std::vector<int64_t>* shape) {
  for (auto& item : *parameters_) {
    auto node = *(item.get());
    if (node.output(0) != name) {
      continue;
    }
    for (auto i = 0; i < node.attribute_size(); i++) {
      auto attr = node.attribute(i);
      if (attr.name() == "value") {
        auto tensor = attr.mutable_t();
        for (int64_t i = 0; i < tensor->dims_size(); i++) {
          shape->push_back(tensor->dims(i));
        }
      }
    }
  }
  return !shape->empty();
}

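// Per-tensor (symmetric INT8) quantize info: scale = max|x| / 127 with a zero
// point of 0. For example, a tensor whose largest absolute value is 2.54 gets
// a scale of 0.02.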
void QuantizeModelProcessor::GetTensorWiseQuantizeInfo(
    const std::vector<float>& tensor, std::vector<float>* scale,
    std::vector<int64_t>* zero) {
  float max_val = -1;
  for (int64_t i = 0; i < tensor.size(); i++) {
    if (fabs(tensor[i]) > max_val) {
      max_val = fabs(tensor[i]);
    }
  }
  Assert(max_val >= 0,
         "[GetTensorWiseQuantizeInfo] Require the scale >= 0, but now it's " +
             std::to_string(max_val) + ".");
  scale->push_back(max_val / 127);
  zero->push_back(0);
}

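// Per-channel (symmetric INT8) quantize info along quant_axis: one scale per
// channel, scale[i] = max|x_i| / 127 over that channel's elements, zero point
// 0. quant_axis == 0 matches the output-channel layout of Conv weights;
// quant_axis == 1 covers 2-D MatMul weights (one scale per column) and the
// second axis of 4-D tensors.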
void QuantizeModelProcessor::GetChannelWiseQuantizeInfo(
    const std::vector<float>& tensor, const std::vector<int64_t>& shape,
    const int64_t& quant_axis, std::vector<float>* scale,
    std::vector<int64_t>* zero) {
  int64_t channel_count = shape[quant_axis];

  for (int64_t i = 0; i < channel_count; i++) {
    if (quant_axis == 0) {
      float max_val = -1;
      int64_t inner_offset = 1;
      for (auto& j : shape) {
        inner_offset *= j;
      }
      inner_offset /= channel_count;
      int64_t index = i * inner_offset;
      for (int64_t j = 0; j < inner_offset; j++) {
        if (fabs(tensor[index + j]) > max_val) {
          max_val = fabs(tensor[index + j]);
        }
      }
      Assert(
          max_val >= 0,
          "[GetChannelWiseQuantizeInfo] Require the scale >= 0, but now it's " +
              std::to_string(max_val) + ".");
      scale->push_back(max_val / 127);
      zero->push_back(0);
    } else if (quant_axis == 1) {
      float max_val = -1;
      int64_t inner_offset = shape.size() == 4 ? shape[2] * shape[3] : 1;
      for (int64_t outter = 0; outter < shape[0]; outter++) {
        int64_t index = outter * channel_count * inner_offset;
        for (int64_t inner = 0; inner < inner_offset; inner++) {
          int64_t final_index = index + i * inner_offset + inner;
          if (fabs(tensor[final_index]) > max_val) {
            max_val = fabs(tensor[final_index]);
          }
        }
      }
      Assert(
          max_val >= 0,
          "[GetChannelWiseQuantizeInfo] Require the scale >= 0, but now it's " +
              std::to_string(max_val) + ".");
      scale->push_back(max_val / 127);
      zero->push_back(0);
    } else {
      Assert(false,
             "QuantizeModelProcessor::GetChannelWiseQuantizeInfo only supports "
             "quant_axis equal to 0 or 1, but now it's " +
                 std::to_string(quant_axis) + ".");
    }
  }
}

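// GetTensorByName is used in this file with T = float (weights, biases and
// scales) and T = int64_t (shape tensors).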
template <typename T>
bool QuantizeModelProcessor::GetTensorByName(const std::string& name,
                                             std::vector<T>* value) {
  // Look for the tensor value in the following order; if it is found, store
  // the data in value and return true:
  // 1. updated_params, e.g. the updated weight of a Conv or MatMul;
  // 2. the parameters of the original graph, e.g. the scale or bias of BN;
  // 3. the Constant nodes in the graph, for all other values.
  auto updated_params_iter = helper_->updated_params.find(name);
  if (updated_params_iter != helper_->updated_params.end()) {
    (updated_params_iter->second).get(value);
    return true;
  }
  for (int64_t block_index = 0; block_index < parser_->NumOfBlocks();
       block_index++) {
    if (parser_->TryGetTensorValue(block_index, name, value)) {
      return true;
    }
  }
  return helper_->TryGetTensorValue(name, value);
}

bool QuantizeModelProcessor::ConnectToOutput(const std::string& output_name) {
  std::vector<std::string> names = {output_name};
  while (!names.empty()) {
    std::string name = names[names.size() - 1];
    names.pop_back();
    if (IsGraphOutput(name)) {
      return true;
    }
    auto next_nodes = name2node_dict_[name];
    for (auto& next : next_nodes) {
      if (next->op_type() == "Identity") {
        names.push_back(next->output(0));
      }
    }
  }
  return false;
}

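// CanBeQuantize returns false if any of tensor_names has no quantize info, or
// if a tensor treated as an output (selected by output_index; an entry of -1
// means the last name in tensor_names) reaches a graph output directly or
// through Identity nodes, in which case the op is skipped.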
bool QuantizeModelProcessor::CanBeQuantize(
    const std::vector<std::string>& tensor_names,
    const std::vector<int64_t>& output_index) {
  for (auto& tensor : tensor_names) {
    if (helper_->quantize_info.find(tensor) == helper_->quantize_info.end()) {
      return false;
    }
  }
  // If the op is linked to a graph output through Identity nodes, it has to
  // be skipped: do not quantize it.
  for (auto i = 0; i < output_index.size(); i++) {
    int64_t index = output_index[i];
    if (index == -1) {
      index = tensor_names.size() - 1;
    }

    std::string output_name = tensor_names[index];
    if (ConnectToOutput(output_name)) {
      return false;
    }
  }
  return true;
}

void QuantizeModelProcessor::AppendQuantizeTensor(const std::string& tensor,
                                                  const bool& only_dequantize) {
  if (only_dequantize) {
    if (std::find(only_dequantize_tensors.begin(),
                  only_dequantize_tensors.end(),
                  tensor) == only_dequantize_tensors.end()) {
      only_dequantize_tensors.push_back(tensor);
    }
  } else {
    if (std::find(tensors_to_be_quantize.begin(), tensors_to_be_quantize.end(),
                  tensor) == tensors_to_be_quantize.end()) {
      tensors_to_be_quantize.push_back(tensor);
    }
  }
}

}  // namespace paddle2onnx