Files
FastDeploy/paddle2onnx/mapper/quantize_helper.cc
Jason 6343b0db47 [Build] Support build with source code of Paddle2ONNX (#1559)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle2onnx/mapper/quantize_helper.h"
namespace paddle2onnx {
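// Remove the first node whose name matches `name`. When update_io is true,
// every consumer of the removed node's output is reconnected to its first
// input so that the graph stays connected.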
void QuantizeModelProcessor::RemoveNodeByName(const std::string& name,
const bool& update_io) {
if (name.empty()) {
return;
}
for (auto iter = nodes_->begin(); iter != nodes_->end(); iter++) {
if ((*iter)->name() == name) {
std::string input_name = (*iter)->input(0);
std::string output_name = (*iter)->output(0);
nodes_->erase(iter);
if (update_io) {
ReplaceInputOfAllNodes(output_name, input_name);
}
return;
}
}
}
void QuantizeModelProcessor::ReplaceInputOfAllNodes(
const std::string& old_name, const std::string& new_name,
const std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>&
except_nodes) {
auto iter = name2node_dict_.find(old_name);
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> need_rename_nodes;
// When replacing all uses of old_name with new_name, also propagate the
// quantize_info of old_name to new_name
auto quantize_info_iter = helper_->quantize_info.find(old_name);
if (quantize_info_iter != helper_->quantize_info.end()) {
helper_->quantize_info[new_name] = helper_->quantize_info[old_name];
}
if (iter != name2node_dict_.end()) {
need_rename_nodes = iter->second;
}
for (auto& node : need_rename_nodes) {
auto iter = std::find(except_nodes.begin(), except_nodes.end(), node);
if (iter != except_nodes.end()) {
continue;
}
for (size_t i = 0; i < node->input_size(); ++i) {
if (node->input(i) == old_name) {
node->set_input(i, new_name);
}
}
}
}
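// Rebuild name2node_dict_, which maps each tensor name to all nodes that
// consume it as an input. Call this again after nodes are added or removed.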
void QuantizeModelProcessor::UpdateInputNameToNodes() {
name2node_dict_.clear();
for (auto& node : *nodes_) {
for (size_t i = 0; i < node->input_size(); ++i) {
std::string node_input = node->input(i);
if (name2node_dict_.find(node_input) != name2node_dict_.end()) {
name2node_dict_[node_input].push_back(node);
} else {
name2node_dict_[node_input] = {node};
}
}
}
}
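// Entry point of quantization post-processing: if the exported graph contains
// Quantize/Dequantize ops, rewrite it into the form expected by the chosen
// deploy backend (QDQ graph, quantization table, or calibration cache).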
void QuantizeModelProcessor::ProcessQuantizeModel(
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* parameters,
std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* inputs,
std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* outputs,
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* nodes,
OnnxHelper* helper, const std::string& deploy_backend,
const PaddleParser& parser, std::string* calibration_cache) {
// Determine whether the model contains quantization-related OPs; if not,
// return directly
bool quantized_model = false;
for (auto& node : *nodes) {
if (node->op_type() == "QuantizeLinear" ||
node->op_type() == "DequantizeLinear") {
quantized_model = true;
break;
}
}
if (!quantized_model) {
return;
}
parser_ = &parser;
helper_ = helper;
parameters_ = parameters;
inputs_ = inputs;
outputs_ = outputs;
nodes_ = nodes;
P2OLogger() << "[Info] Quantize model deploy backend is: " << deploy_backend
<< std::endl;
// Determine the format of the exported ONNX quantization model according to
// the deploy_backend
if (deploy_backend == "others") {
// If deploy_backend is "others", the quantized model is exported as a
// float model plus a quantization table (max_range.txt).
RemoveAllQuantizeOps();
std::ofstream outfile;
outfile.open("max_range.txt", std::ios::out);
if (!outfile.is_open()) {
P2OLogger() << "[WARNING] Quantize model processor failed to write range "
"information to the current location."
<< std::endl;
return;
}
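// Each recorded scale follows the ONNX int8 convention (max_abs / 127), so
// scale * 127 recovers the tensor's maximum absolute range, which is what
// the quantization table stores.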
for (auto iter = helper_->quantize_info.begin();
iter != helper_->quantize_info.end(); iter++) {
std::string log = iter->first;
auto scale = iter->second.scale_;
if (scale.size() == 1) {
log = log + ": " + std::to_string(scale[0] * 127);
outfile << log << std::endl;
}
}
outfile.close();
} else if (deploy_backend == "onnxruntime") {
// When deploy_backend is ONNXRuntime, use the following six steps to process:
// 1. broadcast quantize info
// 2. remove all quantize ops
// 3. merge conv and add
// 4. merge conv and bn
// 5. add Q and DQ according to the ONNXRuntime quantize OP fusion pattern
// 6. topologically sort the nodes
QuantizeInfoBroadcast();
RemoveAllQuantizeOps();
MergeConvAdd();
MergeConvBN();
AddQDQForORT();
SortNodes();
} else if (deploy_backend == "tensorrt") {
// When deploy_backend is TensorRT, use the following steps to process:
// For Explicit Quantization:
// 1. broadcast quantize info
// 2. remove all quantize ops
// 3. add Q and DQ before conv and matmul
// 4. topologically sort the nodes
// For Implicit Quantization:
// 1. broadcast quantize info
// 2. remove all quantize ops
// 3. save a float ONNX model and calibration.cache
QuantizeInfoBroadcast();
RemoveAllQuantizeOps();
// Add qdq for Explicit Quantization
// AddTrtQDQ();
// SortNodes();
// Generate calibration.cache for Implicit Quantization
// (each scale is written as the hex encoding of its float value)
GenerateCache(calibration_cache);
} else if (deploy_backend == "rknn") {
// When deploy_backend is RKNN, use the following six steps to process:
// 1. broadcast quantize info
// 2. remove all quantize ops
// 3. merge conv and add
// 4. merge conv and bn
// 5. add Q and DQ
// 6. topologically sort the nodes
QuantizeInfoBroadcast();
RemoveAllQuantizeOps();
MergeConvAdd();
MergeConvBN();
AddQDQForRKNN();
SortNodes();
} else {
Assert(false,
"[QuantizeModelProcessor] Only support 'onnxruntime' / 'tensorrt' / "
"'rknn' / 'others' as backend now, but the backend is: " +
deploy_backend + ".");
}
}
void QuantizeModelProcessor::AddQDQForRKNN() {
UpdateInputNameToNodes();
supported_quantize_type_ = {"Abs",
"Acos",
"Add",
"Asin",
"Atan",
"AveragePool",
"BatchNormalization",
"Ceil",
"Clip",
"Conv",
"ConvTranspose",
"Cos",
"Cosh",
"Concat",
"Elu",
"Erf",
"Exp",
"Floor",
"Gemm",
"HardSigmoid",
"InstanceNormalization",
"IsInf",
"IsNaN",
"Log",
"MaxPool",
"Mul",
"Neg",
"ReduceMean",
"Relu",
"Resize",
"Round",
"Sigmoid",
"Sin",
"Sinh",
"Split",
"Sqrt",
"Tan",
"Tanh"};
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
auto node = *iter;
auto type_iter = std::find(supported_quantize_type_.begin(),
supported_quantize_type_.end(), node->op_type());
if (!supported_quantize_type_.empty() &&
type_iter == supported_quantize_type_.end()) {
continue;
}
// Add missing quantize info for the constant (weight) inputs of Mul, MatMul and Add
if (node->op_type() == "Mul" || node->op_type() == "MatMul" ||
node->op_type() == "Add") {
for (size_t i = 0; i < node->input_size(); ++i) {
std::string node_input = node->input(i);
if (helper_->quantize_info.find(node_input) !=
helper_->quantize_info.end()) {
continue;
}
std::vector<float> weight;
if (!GetTensorByName(node_input, &weight)) {
continue;
}
std::vector<int64_t> weight_shape;
GetTensorShape(node_input, &weight_shape);
int64_t quantize_axis = 1;
if (node->op_type() == "Add") {
quantize_axis = 0;
}
std::vector<float> scale;
std::vector<int64_t> zeros;
if (node->op_type() == "Add" || weight_shape.size() == 1 ||
weight_shape.empty()) {
GetTensorWiseQuantizeInfo(weight, &scale, &zeros);
} else {
GetChannelWiseQuantizeInfo(weight, weight_shape, quantize_axis,
&scale, &zeros);
}
std::string weight_scale_node, weight_zero_node;
if (scale.size() == 1) {
weight_scale_node = helper_->Constant(
{}, ONNX_NAMESPACE::TensorProto::FLOAT, scale[0]);
weight_zero_node = helper_->Constant(
{}, ONNX_NAMESPACE::TensorProto::INT8, zeros[0]);
} else {
weight_scale_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
weight_zero_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
}
QuantizeInfo weight_quantize_info(scale, zeros, weight_scale_node,
weight_zero_node, quantize_axis);
helper_->quantize_info[node_input] = weight_quantize_info;
}
}
std::vector<std::string> tensor_names;
for (size_t i = 0; i < node->input_size(); ++i) {
std::string node_input = node->input(i);
tensor_names.push_back(node_input);
}
for (size_t i = 0; i < node->output_size(); ++i) {
std::string node_output = node->output(i);
tensor_names.push_back(node_output);
}
if (!CanBeQuantize(tensor_names)) {
continue;
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
// Refresh name2node_dict_ to account for the graph changes above.
UpdateInputNameToNodes();
// Add QDQ in model
AddQDQInModel(tensors_to_be_quantize);
}
void QuantizeModelProcessor::GenerateCache(std::string* calibration_cache) {
union {
float f;
unsigned char farray[4];
} un;
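// The calibration cache lists one "<tensor name>: <hex>" entry per tensor,
// where <hex> is the scale's four float bytes printed from the most to the
// least significant byte; the union reinterprets the float bits for that.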
*calibration_cache += "TRT-8XXX-EntropyCalibration2 \n";
for (auto iter = helper_->quantize_info.rbegin();
iter != helper_->quantize_info.rend(); iter++) {
std::string tensor_name = iter->first;
QuantizeInfo quantize_info = iter->second;
if (quantize_info.scale_.size() == 1) {
float val = quantize_info.scale_[0];
un.f = val;
*calibration_cache += (tensor_name + ": ");
std::stringstream enc;
for (int64_t i = 3; i >= 0; i--) {
enc << std::hex << std::setw(2) << std::setfill('0')
<< (int)(un.farray[i]);
}
*calibration_cache = *calibration_cache + enc.str() + "\n";
}
}
}
// In TensorRT, all quantizable ops: Conv, ConvTranspose, linear (MatMul),
// MaxPool, AvgPool, AdaptiveAvgPool, rnn (not supported yet)
// https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules
void QuantizeModelProcessor::AddTrtQDQ() {
UpdateInputNameToNodes();
std::vector<std::string>
quantize_tensors; // save the tensor names that need add quantize ops
std::vector<std::string> pool_types = {"MaxPool", "AvgPool",
"AdaptiveAvgPool"};
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
quantize_tensors.clear();
auto node = *iter;
if (node->op_type() == "Conv" || node->op_type() == "ConvTranspose") {
std::vector<std::string> tensor_names = {node->input(0), node->input(1)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
quantize_tensors = tensor_names;
}
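// MatMul weights may not carry quantize info from the Paddle model; when the
// weight is a constant tensor, compute per-channel (axis = 1) scales on the
// fly so that the MatMul inputs can still be quantized.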
if (node->op_type() == "MatMul") {
std::vector<std::string> tensor_names = {node->input(0), node->input(1)};
for (auto& name : tensor_names) {
if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) {
continue;
}
std::vector<float> matmul_weight;
if (!GetTensorByName(name, &matmul_weight)) {
continue;
}
std::vector<int64_t> matmul_weight_shape;
if (!GetTensorShape(name, &matmul_weight_shape)) {
continue;
}
int64_t quantize_axis = 1;
std::vector<float> scale;
std::vector<int64_t> zeros;
GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
quantize_axis, &scale, &zeros);
auto scale_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
auto zero_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
zero_node, quantize_axis);
helper_->quantize_info[name] = matmul_weight_quantize_info;
}
if (!CanBeQuantize(tensor_names)) {
continue;
}
quantize_tensors = tensor_names;
}
auto type_iter =
std::find(pool_types.begin(), pool_types.end(), node->op_type());
if (type_iter != pool_types.end()) {
std::vector<std::string> tensor_names = {node->input(0)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
quantize_tensors = tensor_names;
}
std::string negative_scale_tensor = "";
for (std::string& name : quantize_tensors) {
Assert(
helper_->quantize_info.find(name) != helper_->quantize_info.end(),
"[QuantizeModelProcessor] Can not find quantize info for tensor: " +
name);
QuantizeInfo quantize_info = helper_->quantize_info[name];
std::vector<float> scales = quantize_info.scale_;
for (auto& i : scales) {
if (i <= 1e-10) {
negative_scale_tensor = negative_scale_tensor + " " + name;
}
}
}
if (negative_scale_tensor.size() > 0) {
P2OLogger()
<< "[Warning] The scale of tensors: [ " + negative_scale_tensor +
" ] contains a negative or near-zero scale, so this OP will not be quantized."
<< std::endl;
continue;
}
// Each OP gets its own Q/DQ pair: only this node's input is rewired to the
// DequantizeLinear output, not every consumer of the tensor.
for (std::string& name : quantize_tensors) {
if (IsGraphOutput(name)) {
continue;
}
QuantizeInfo quantize_info = helper_->quantize_info[name];
std::string scale_node = quantize_info.scale_node_;
std::string zeros_node = quantize_info.zeros_node_;
int64_t quantize_axis = quantize_info.quantize_axis_;
auto q_node =
helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node});
if (helper_->GetOpsetVersion() >= 13) {
AddAttribute(q_node, "axis", quantize_axis);
}
auto dq_node = helper_->MakeNode(
"DequantizeLinear", {q_node->output(0), scale_node, zeros_node});
if (helper_->GetOpsetVersion() >= 13) {
AddAttribute(dq_node, "axis", quantize_axis);
}
for (size_t i = 0; i < node->input_size(); ++i) {
if (node->input(i) == name) {
node->set_input(i, dq_node->output(0));
}
}
}
}
}
// According to:
// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
void QuantizeModelProcessor::AddQDQForORT() {
UpdateInputNameToNodes();
supported_quantize_type_ = {"Conv", "MatMul", "Mul", "Sigmoid",
"Add", "LeakyRelu", "Relu"};
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
auto node = *iter;
auto type_iter = std::find(supported_quantize_type_.begin(),
supported_quantize_type_.end(), node->op_type());
if (!supported_quantize_type_.empty() &&
type_iter == supported_quantize_type_.end()) {
continue;
}
// For each supported op below, collect the tensors that need Q and DQ into
// tensors_to_be_quantize; the actual Q/DQ nodes are inserted later by
// AddQDQInModel.
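// Note: Relu is rewritten below as LeakyRelu with alpha = 0 (numerically
// identical), presumably so that it matches ONNXRuntime's LeakyRelu QDQ
// fusion pattern linked above.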
if (node->op_type() == "Relu") {
std::vector<std::string> tensor_names = {node->input(0), node->output(0)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
node->set_op_type("LeakyRelu");
AddAttribute(node, "alpha", static_cast<float>(0.0));
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
if (node->op_type() == "LeakyRelu") {
std::vector<std::string> tensor_names = {node->input(0), node->output(0)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
if (node->op_type() == "Add") {
std::vector<std::string> tensor_names = {node->input(0), node->input(1),
node->output(0)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
if (node->op_type() == "Sigmoid") {
std::vector<std::string> tensor_names = {node->input(0), node->output(0)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
if (node->op_type() == "Conv") {
std::vector<std::string> tensor_names = {node->input(0), node->input(1),
node->output(0)};
if (node->input_size() == 3) {
tensor_names.push_back(node->input(2));
}
if (!CanBeQuantize(tensor_names, {2})) {
continue;
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
if (node->op_type() == "MatMul") {
std::vector<std::string> tensor_names = {node->input(0), node->input(1),
node->output(0)};
for (auto& name : tensor_names) {
if (helper_->quantize_info.find(name) != helper_->quantize_info.end()) {
continue;
}
std::vector<float> matmul_weight;
if (!GetTensorByName(name, &matmul_weight)) {
continue;
}
std::vector<int64_t> matmul_weight_shape;
if (!GetTensorShape(name, &matmul_weight_shape)) {
continue;
}
int64_t quantize_axis = 1;
std::vector<float> scale;
std::vector<int64_t> zeros;
GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
quantize_axis, &scale, &zeros);
auto scale_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
auto zero_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
zero_node, quantize_axis);
helper_->quantize_info[name] = matmul_weight_quantize_info;
}
if (!CanBeQuantize(tensor_names)) {
tensor_names.pop_back();
if (!CanBeQuantize(tensor_names)) {
continue;
}
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
if (node->op_type() == "Mul") {
std::vector<std::string> tensor_names = {node->input(0), node->input(1),
node->output(0)};
if (!CanBeQuantize(tensor_names)) {
continue;
}
for (auto& name : tensor_names) {
AppendQuantizeTensor(name);
}
}
}
// update name2node_dict for the change of Relu op.
UpdateInputNameToNodes();
// Add QDQ in model
AddQDQInModel(tensors_to_be_quantize);
}
void QuantizeModelProcessor::AddQDQInModel(
const std::vector<std::string>& tensors_to_be_quantize) {
// add Q and DQ according to tensors_to_be_quantize
for (auto& name : tensors_to_be_quantize) {
if (IsGraphOutput(name)) {
continue;
}
Assert(helper_->quantize_info.find(name) != helper_->quantize_info.end(),
"[QuantizeModelProcessor] Can not find quantize info for tensor: " +
name);
QuantizeInfo quantize_info = helper_->quantize_info[name];
std::string scale_node = quantize_info.scale_node_;
std::string zeros_node = quantize_info.zeros_node_;
int64_t quantize_axis = quantize_info.quantize_axis_;
auto iter = std::find(only_dequantize_tensors.begin(),
only_dequantize_tensors.end(), name);
if (iter != only_dequantize_tensors.end()) {
// if only add DequantizeLinear
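// Bias tensors are quantized to int32 as rint(bias / (act_scale * weight_scale))
// with zero point 0 (the scale set in MergeConvAdd / MergeConvBN), so only a
// DequantizeLinear is needed to recover the float bias at runtime.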
std::vector<float> scale = quantize_info.scale_;
std::vector<float> bias;
Assert(GetTensorByName(name, &bias),
"[QuantizeModelProcessor] Can not find bias value: " + name);
std::vector<int32_t> new_bias(scale.size(), 0);
for (int64_t i = 0; i < bias.size(); i++) {
float scale_val = scale.size() == 1 ? scale[0] : scale[i];
new_bias[i] = rint(bias[i] / scale_val);
}
Weight updated_bias;
std::vector<int64_t> bias_shape = {static_cast<int64_t>(new_bias.size())};
updated_bias.set(P2ODataType::INT32, bias_shape, new_bias);
helper_->updated_params[name] = updated_bias;
auto dq_node =
helper_->MakeNode("DequantizeLinear", {name, scale_node, zeros_node});
if (helper_->GetOpsetVersion() >= 13) {
AddAttribute(dq_node, "axis", quantize_axis);
}
ReplaceInputOfAllNodes(name, dq_node->output(0));
} else {
// Handle the following situation: only consumers whose type supports
// quantization are rerouted through the new Q/DQ pair; unsupported
// consumers (except_nodes, e.g. the scale below) keep reading the
// original tensor.
//        conv                        conv
//       /  |  \          ->         /    \
//   conv conv scale               DQD    scale
//                                 /  \
//                             conv    conv
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> except_nodes;
auto next_nodes = name2node_dict_[name];
if (next_nodes.size() > 1) {
for (auto& node : next_nodes) {
auto iter =
std::find(supported_quantize_type_.begin(),
supported_quantize_type_.end(), node->op_type());
if (iter == supported_quantize_type_.end()) {
except_nodes.push_back(node);
}
}
}
// If none of this tensor's consumers can be rerouted (all are unsupported
// types), reroute them all anyway; the inserted quantize ops will simply be
// merged later.
if (next_nodes.size() == except_nodes.size()) {
except_nodes.clear();
}
auto q_node =
helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node});
if (helper_->GetOpsetVersion() >= 13) {
AddAttribute(q_node, "axis", quantize_axis);
}
auto dq_node = helper_->MakeNode(
"DequantizeLinear", {q_node->output(0), scale_node, zeros_node});
if (helper_->GetOpsetVersion() >= 13) {
AddAttribute(dq_node, "axis", quantize_axis);
}
ReplaceInputOfAllNodes(name, dq_node->output(0), except_nodes);
}
}
}
void QuantizeModelProcessor::MergeConvBN() {
UpdateInputNameToNodes();
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
auto conv_node = *iter;
if (conv_node->op_type() != "Conv") {
continue;
}
bool act_has_quantize_info =
helper_->quantize_info.find(conv_node->input(0)) !=
helper_->quantize_info.end();
if (!act_has_quantize_info) {
continue;
}
auto next_nodes = name2node_dict_[conv_node->output(0)];
if (next_nodes.size() > 1 || IsGraphOutput(conv_node->output(0))) {
continue;
}
auto bn_node = next_nodes[0];
if (bn_node->op_type() != "BatchNormalization" ||
IsGraphOutput(bn_node->output(0))) {
continue;
}
std::vector<float> conv_weight;
Assert(GetTensorByName(conv_node->input(1), &conv_weight),
"Can not get " + conv_node->input(1) + " from Conv.");
std::vector<float> bn_scale;
Assert(GetTensorByName(bn_node->input(1), &bn_scale),
"Can not get " + bn_node->input(1) + " from BN.");
std::vector<float> bn_bias;
Assert(GetTensorByName(bn_node->input(2), &bn_bias),
"Can not get " + bn_node->input(2) + " from BN.");
std::vector<float> bn_mean;
Assert(GetTensorByName(bn_node->input(3), &bn_mean),
"Can not get " + bn_node->input(3) + " from BN.");
std::vector<float> bn_var;
Assert(GetTensorByName(bn_node->input(4), &bn_var),
"Can not get " + bn_node->input(4) + " from BN.");
float epsilon = 1;
for (auto i = 0; i < bn_node->attribute_size(); i++) {
auto attr = bn_node->attribute(i);
if (attr.name() == "epsilon") {
epsilon = attr.f();
}
}
std::vector<float> conv_bias(bn_bias.size(), 0);
std::string conv_bias_node = conv_node->input(1) + ".merged.bias";
if (conv_node->input_size() == 3) {
conv_bias_node = conv_node->input(2);
conv_bias.clear();
Assert(GetTensorByName(conv_bias_node, &conv_bias),
"Can not get " + conv_node->input(2) + " in Conv.");
}
// merge conv and bn
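// With alpha = bn_scale / sqrt(bn_var + epsilon):
// BN(Conv(x)) = alpha * (W * x + b) + (bn_bias - bn_mean * alpha),
// so the folded per-output-channel weight is W * alpha and the folded bias is
// b * alpha + bn_bias - bn_mean * alpha.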
std::vector<float> alpha(bn_scale.size());
for (int64_t i = 0; i < bn_scale.size(); i++) {
alpha[i] = bn_scale[i] / sqrt(bn_var[i] + epsilon);
}
std::vector<float> new_bias(bn_scale.size());
for (int64_t i = 0; i < bn_scale.size(); i++) {
new_bias[i] =
conv_bias[i] * alpha[i] + (bn_bias[i] - bn_mean[i] * alpha[i]);
}
std::vector<float> new_weight(conv_weight.size());
int64_t offset = conv_weight.size() / bn_bias.size();
for (int64_t i = 0; i < bn_scale.size(); i++) {
int64_t outter_offset = i * offset;
for (int64_t j = 0; j < offset; j++) {
int64_t index = outter_offset + j;
new_weight[index] = conv_weight[index] * alpha[i];
}
}
// update weight
std::vector<int64_t> weight_shape;
Assert(GetTensorShape(conv_node->input(1), &weight_shape),
"Can not get the shape of " + conv_node->input(1) + " in Conv.");
Weight updated_conv_weight;
updated_conv_weight.set(P2ODataType::FP32, weight_shape, new_weight);
helper_->updated_params[conv_node->input(1)] = updated_conv_weight;
// update bias
Weight updated_bias_weight;
std::vector<int64_t> bias_shape = {static_cast<int64_t>(new_bias.size())};
updated_bias_weight.set(P2ODataType::FP32, bias_shape, new_bias);
helper_->updated_params[conv_bias_node] = updated_bias_weight;
AppendQuantizeTensor(conv_bias_node, true);
// update weight scale
auto quantize_info = helper_->quantize_info[conv_node->input(1)];
std::string scale_node = quantize_info.scale_node_;
std::string zero_node = quantize_info.zeros_node_;
int64_t quantize_axis = quantize_info.quantize_axis_;
RemoveNodeByName(scale_node);
RemoveNodeByName(zero_node);
std::vector<float> scale = quantize_info.scale_;
std::vector<float> new_scale;
std::vector<int64_t> new_zeros;
if (scale.size() == 1) {
GetTensorWiseQuantizeInfo(new_weight, &new_scale, &new_zeros);
} else {
GetChannelWiseQuantizeInfo(new_weight, weight_shape, quantize_axis,
&new_scale, &new_zeros);
}
auto weight_scale_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, new_scale);
auto weight_zero_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, new_zeros);
QuantizeInfo updated_weight_quantize_info(new_scale, new_zeros,
weight_scale_node,
weight_zero_node, quantize_axis);
helper_->quantize_info[conv_node->input(1)] = updated_weight_quantize_info;
// add bias scale and update bias
auto act_quantize_info = helper_->quantize_info[conv_node->input(0)];
std::vector<float> act_scale = act_quantize_info.scale_;
std::vector<float> bias_scale;
for (int64_t i = 0; i < new_scale.size(); i++) {
bias_scale.push_back(act_scale[0] * new_scale[i]);
}
std::vector<int64_t> bias_zeros(bias_scale.size(), 0);
auto bias_scale_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, bias_scale);
auto bias_zero_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, bias_zeros);
QuantizeInfo bias_quantize_info(bias_scale, bias_zeros, bias_scale_node,
bias_zero_node, 0);
helper_->quantize_info[conv_bias_node] = bias_quantize_info;
if (conv_node->input_size() == 2) {
conv_node->add_input(conv_bias_node);
}
// remove BN op
RemoveNodeByName(bn_node->name());
}
}
void QuantizeModelProcessor::MergeConvAdd() {
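// Fold the pattern Conv -> Add, where the Add's other input is
// Reshape(bias, [1, C, 1, 1]) with a constant bias, into the Conv's bias
// input. Requires that both the Conv activation and weight already carry
// quantize info.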
UpdateInputNameToNodes();
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
auto node = *iter;
if (node->op_type() != "Conv") {
continue;
}
// if act input of conv does not have quantize info, continue
bool act_has_quantize_info = helper_->quantize_info.find(node->input(0)) !=
helper_->quantize_info.end();
if (!act_has_quantize_info) {
continue;
}
// if weight of conv does not have quantize info, continue
bool weight_has_quantize_info =
helper_->quantize_info.find(node->input(1)) !=
helper_->quantize_info.end();
if (!weight_has_quantize_info) {
continue;
}
auto next_nodes = name2node_dict_[node->output(0)];
if (next_nodes.size() > 1 || IsGraphOutput(node->output(0))) {
continue;
}
auto next_node = next_nodes[0];
if (next_node->op_type() != "Add" || IsGraphOutput(next_node->output(0))) {
continue;
}
std::string reshape_node = node->output(0) == next_node->input(0)
? next_node->input(1)
: next_node->input(0);
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> before_nodes;
for (auto& node : *nodes_) {
for (size_t i = 0; i < node->output_size(); ++i) {
std::string node_output = node->output(i);
if (node_output == reshape_node) {
before_nodes.push_back(node);
break;
}
}
}
if (before_nodes.size() != 1 || before_nodes[0]->op_type() != "Reshape") {
continue;
}
std::string bias_node = before_nodes[0]->input(0);
// continue if bias is not a constant
std::vector<float> bias_val;
if (!GetTensorByName(bias_node, &bias_val)) {
continue;
}
// continue if shape tensor of reshape op is not a constant
std::vector<int64_t> shape_val;
if (!GetTensorByName(before_nodes[0]->input(1), &shape_val)) {
continue;
}
// continue if shape_val != [1, bias_val.size(), 1, 1]
std::vector<int64_t> target = {1, static_cast<int64_t>(bias_val.size()), 1,
1};
if (target != shape_val) {
continue;
}
// remove Reshape op
RemoveNodeByName(before_nodes[0]->name());
// add scale for bias
std::vector<float> weight_scale =
helper_->quantize_info[node->input(1)].scale_;
std::vector<float> act_scale =
helper_->quantize_info[node->input(0)].scale_;
std::vector<float> bias_scale;
for (int64_t i = 0; i < weight_scale.size(); i++) {
bias_scale.push_back(weight_scale[i] * act_scale[0]);
}
std::vector<int64_t> onnx_zeros(bias_scale.size(), 0);
auto scale_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, bias_scale);
auto zero_node =
helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, onnx_zeros);
QuantizeInfo quantize_info(bias_scale, onnx_zeros, scale_node, zero_node,
0);
helper_->quantize_info[bias_node] = quantize_info;
AppendQuantizeTensor(bias_node, true);
node->add_input(bias_node);
RemoveNodeByName(next_node->name());
}
}
void QuantizeModelProcessor::SortNodes() {
// Topologically sort the nodes:
// 1. Build i2o_mapper and constant_nodes. i2o_mapper maps each node to all of
// its output (downstream) nodes; constant_nodes collects all Constant nodes.
// 2. Nodes with no output nodes are saved to new_nodes first; then the
// processed nodes are repeatedly erased from the i2o_mapper entries, and any
// node whose output-node list becomes empty is saved to new_nodes in turn.
// 3. Append the constant nodes to new_nodes.
// 4. Reverse new_nodes, then assign it back to nodes_.
std::map<std::string, std::vector<std::string>> i2o_mapper;
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> constant_nodes;
std::map<std::string, std::shared_ptr<ONNX_NAMESPACE::NodeProto>>
name2node_mapper;
for (int64_t i = 0; i < nodes_->size(); i++) {
auto node = (*nodes_)[i];
if (node->op_type() == "Constant") {
constant_nodes.push_back(node);
continue;
}
name2node_mapper[node->name()] = node;
for (int64_t in_index = 0; in_index < node->input_size(); in_index++) {
std::string input = node->input(in_index);
for (int64_t j = 0; j < nodes_->size(); j++) {
if (i == j) {
continue;
}
auto input_node = (*nodes_)[j];
if (input_node->op_type() == "Constant") {
continue;
}
for (int64_t out_index = 0; out_index < input_node->output_size();
out_index++) {
if (input == input_node->output(out_index)) {
if (i2o_mapper.find(input_node->name()) == i2o_mapper.end()) {
i2o_mapper[input_node->name()] = {node->name()};
} else {
auto iter =
std::find(i2o_mapper[input_node->name()].begin(),
i2o_mapper[input_node->name()].end(), node->name());
if (iter == i2o_mapper[input_node->name()].end()) {
i2o_mapper[input_node->name()].push_back(node->name());
}
}
}
}
}
}
}
std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> new_nodes;
for (int64_t i = 0; i < nodes_->size(); i++) {
auto node_name = (*nodes_)[i]->name();
auto node = (*nodes_)[i];
if (node->op_type() == "Constant") {
continue;
}
if (i2o_mapper.find(node_name) == i2o_mapper.end()) {
new_nodes.push_back(node);
}
}
int64_t index = 0;
while (index < new_nodes.size()) {
auto current_node = new_nodes[index];
std::string current_node_name = current_node->name();
for (auto iter = i2o_mapper.begin(); iter != i2o_mapper.end(); iter++) {
std::string input_node_name = iter->first;
std::vector<std::string>* output_nodes_name = &iter->second;
if (output_nodes_name->empty()) {
continue;
}
auto in_inter = std::find(output_nodes_name->begin(),
output_nodes_name->end(), current_node_name);
if (in_inter != output_nodes_name->end()) {
output_nodes_name->erase(in_inter);
}
if (output_nodes_name->empty()) {
new_nodes.push_back(name2node_mapper[input_node_name]);
}
}
index++;
}
for (auto& node : constant_nodes) {
new_nodes.push_back(node);
}
std::reverse(new_nodes.begin(), new_nodes.end());
Assert(nodes_->size() == new_nodes.size(),
"The number of nodes after topological sorting is not equal to the "
"number before sorting");
*nodes_ = new_nodes;
}
void QuantizeModelProcessor::RemoveAllQuantizeOps() {
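// Strip QuantizeLinear -> DequantizeLinear pairs inherited from the Paddle
// quantized model and reconnect their consumers to the original float tensor;
// the recorded scales remain in helper_->quantize_info for later use.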
UpdateInputNameToNodes();
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
auto node = *iter;
if (node->op_type() != "QuantizeLinear") {
continue;
}
auto next_node_names = name2node_dict_[node->output(0)];
if (next_node_names.empty() || !next_node_names[0]->has_op_type() ||
next_node_names[0]->op_type() != "DequantizeLinear") {
continue;
}
std::string input_name = node->input(0);
RemoveNodeByName(node->name());
std::string output_name = next_node_names[0]->output(0);
RemoveNodeByName(next_node_names[0]->name());
ReplaceInputOfAllNodes(output_name, input_name);
}
}
// Broadcast quantize info between the input and output of the OPs that will not
// change quantize info
void QuantizeModelProcessor::QuantizeInfoBroadcast() {
UpdateInputNameToNodes();
for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
auto node = *iter;
if (node->op_type() != "Identity") {
continue;
}
std::string input_name = node->input(0);
std::string output_name = node->output(0);
auto input_quantize_info_iter = helper_->quantize_info.find(input_name);
auto output_quantize_info_iter = helper_->quantize_info.find(output_name);
// Neither the input nor the output of Identity has quantize info
if (input_quantize_info_iter == helper_->quantize_info.end() &&
output_quantize_info_iter == helper_->quantize_info.end()) {
continue;
}
// Both the input and output of Identity already have quantize info
if (input_quantize_info_iter != helper_->quantize_info.end() &&
output_quantize_info_iter != helper_->quantize_info.end()) {
continue;
}
if (input_quantize_info_iter != helper_->quantize_info.end()) {
helper_->quantize_info[output_name] = helper_->quantize_info[input_name];
} else if (output_quantize_info_iter != helper_->quantize_info.end()) {
helper_->quantize_info[input_name] = helper_->quantize_info[output_name];
}
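// Keep the Identity node if it (directly or through further Identity nodes)
// feeds a graph output; otherwise it is now redundant and can be removed.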
if (ConnectToOutput(output_name)) {
continue;
}
RemoveNodeByName(node->name());
iter--;
}
}
bool QuantizeModelProcessor::IsGraphOutput(const std::string& name) {
for (auto& item : *outputs_) {
auto out_node = (*item.get());
if (name == out_node.name()) {
return true;
}
}
return false;
}
// Try to get the shape of the named tensor from the graph parameters
bool QuantizeModelProcessor::GetTensorShape(const std::string& name,
std::vector<int64_t>* shape) {
for (auto& item : *parameters_) {
auto node = *(item.get());
if (node.output(0) != name) {
continue;
}
for (auto i = 0; i < node.attribute_size(); i++) {
auto attr = node.attribute(i);
if (attr.name() == "value") {
auto tensor = attr.mutable_t();
for (int64_t i = 0; i < tensor->dims_size(); i++) {
shape->push_back(tensor->dims(i));
}
}
}
}
return !shape->empty();
}
void QuantizeModelProcessor::GetTensorWiseQuantizeInfo(
const std::vector<float>& tensor, std::vector<float>* scale,
std::vector<int64_t>* zero) {
float max_val = -1;
for (int64_t i = 0; i < tensor.size(); i++) {
if (fabs(tensor[i]) > max_val) {
max_val = fabs(tensor[i]);
}
}
Assert(max_val >= 0,
"[GetTensorWiseQuantizeInfo] Require the scale >= 0, but now it's " +
std::to_string(max_val) + ".");
scale->push_back(max_val / 127);
zero->push_back(0);
}
void QuantizeModelProcessor::GetChannelWiseQuantizeInfo(
const std::vector<float>& tensor, const std::vector<int64_t>& shape,
const int64_t& quant_axis, std::vector<float>* scale,
std::vector<int64_t>* zero) {
int64_t channel_count = shape[quant_axis];
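// quant_axis == 0: the channel dimension is the outermost one (e.g. Conv
// weights [out_c, in_c, kh, kw]); quant_axis == 1: the channel dimension is
// the second one (e.g. 2-D MatMul weights [in_c, out_c] or 4-D [N, C, H, W]
// tensors), matching the index arithmetic below.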
for (int64_t i = 0; i < channel_count; i++) {
if (quant_axis == 0) {
float max_val = -1;
int64_t inner_offset = 1;
for (auto& j : shape) {
inner_offset *= j;
}
inner_offset /= channel_count;
int64_t index = i * inner_offset;
for (int64_t j = 0; j < inner_offset; j++) {
if (fabs(tensor[index + j]) > max_val) {
max_val = fabs(tensor[index + j]);
}
}
Assert(
max_val >= 0,
"[GetChannelWiseQuantizeInfo] Require the scale >= 0, but now it's " +
std::to_string(max_val) + ".");
scale->push_back(max_val / 127);
zero->push_back(0);
} else if (quant_axis == 1) {
float max_val = -1;
int64_t inner_offset = shape.size() == 4 ? shape[2] * shape[3] : 1;
for (int64_t outter = 0; outter < shape[0]; outter++) {
int64_t index = outter * channel_count * inner_offset;
for (int64_t inner = 0; inner < inner_offset; inner++) {
int64_t final_index = index + i * inner_offset + inner;
if (fabs(tensor[final_index]) > max_val) {
max_val = fabs(tensor[final_index]);
}
}
}
Assert(
max_val >= 0,
"[GetChannelWiseQuantizeInfo] Require the scale >= 0, but now it's " +
std::to_string(max_val) + ".");
scale->push_back(max_val / 127);
zero->push_back(0);
} else {
Assert(false,
"QuantizeModelProcessor::GetChannelWiseQuantizeInfo only supports "
"quant_axis equals to 0 or 1, but now it's " +
std::to_string(quant_axis) + ".");
}
}
}
template <typename T>
bool QuantizeModelProcessor::GetTensorByName(const std::string& name,
std::vector<T>* value) {
// Look for the tensor value in the following order; if found, store the data
// in *value and return true:
// 1. updated_params: the updated weights of Conv or MatMul.
// 2. parameters of the original graph, e.g. the scale or bias of BN.
// 3. Constant nodes in the graph, for other values.
auto updated_params_iter = helper_->updated_params.find(name);
if (updated_params_iter != helper_->updated_params.end()) {
(updated_params_iter->second).get(value);
return true;
}
for (int64_t block_index = 0; block_index < parser_->NumOfBlocks();
block_index++) {
if (parser_->TryGetTensorValue(block_index, name, value)) {
return true;
}
}
return helper_->TryGetTensorValue(name, value);
}
bool QuantizeModelProcessor::ConnectToOutput(const std::string& output_name) {
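// Follow chains of Identity nodes forward from output_name and return true
// if any path reaches a graph output.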
std::vector<std::string> names = {output_name};
while (!names.empty()) {
std::string name = names[names.size() - 1];
names.pop_back();
if (IsGraphOutput(name)) {
return true;
}
auto next_nodes = name2node_dict_[name];
for (auto& next : next_nodes) {
if (next->op_type() == "Identity") {
names.push_back(next->output(0));
}
}
}
return false;
}
bool QuantizeModelProcessor::CanBeQuantize(
const std::vector<std::string>& tensor_names,
const std::vector<int64_t>& output_index) {
for (auto& tensor : tensor_names) {
if (helper_->quantize_info.find(tensor) == helper_->quantize_info.end()) {
return false;
}
}
// If one of the output tensors is connected (possibly through Identity ops)
// to a graph output, skip this OP and do not quantize it. An index of -1
// refers to the last entry of tensor_names.
for (auto i = 0; i < output_index.size(); i++) {
int64_t index = output_index[i];
if (index == -1) {
index = tensor_names.size() - 1;
}
std::string output_name = tensor_names[index];
if (ConnectToOutput(output_name)) {
return false;
}
}
return true;
}
void QuantizeModelProcessor::AppendQuantizeTensor(const std::string& tensor,
const bool& only_dequantize) {
if (only_dequantize) {
if (std::find(only_dequantize_tensors.begin(),
only_dequantize_tensors.end(),
tensor) == only_dequantize_tensors.end()) {
only_dequantize_tensors.push_back(tensor);
}
} else {
if (std::find(tensors_to_be_quantize.begin(), tensors_to_be_quantize.end(),
tensor) == tensors_to_be_quantize.end()) {
tensors_to_be_quantize.push_back(tensor);
}
}
}
} // namespace paddle2onnx