// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle2onnx/mapper/quantize_helper.h"

namespace paddle2onnx {

void QuantizeModelProcessor::RemoveNodeByName(const std::string& name,
                                              const bool& update_io) {
  if (name.empty()) {
    return;
  }
  for (auto iter = nodes_->begin(); iter != nodes_->end(); iter++) {
    if ((*iter)->name() == name) {
      std::string input_name = (*iter)->input(0);
      std::string output_name = (*iter)->output(0);
      nodes_->erase(iter);
      if (update_io) {
        ReplaceInputOfAllNodes(output_name, input_name);
      }
      return;
    }
  }
}

void QuantizeModelProcessor::ReplaceInputOfAllNodes(
    const std::string& old_name, const std::string& new_name,
    const std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>&
        except_nodes) {
  auto iter = name2node_dict_.find(old_name);
  std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> need_rename_nodes;
  // After replacing all uses of old_name with new_name, copy the
  // quantize_info of old_name to new_name.
  auto quantize_info_iter = helper_->quantize_info.find(old_name);
  if (quantize_info_iter != helper_->quantize_info.end()) {
    helper_->quantize_info[new_name] = helper_->quantize_info[old_name];
  }
  if (iter != name2node_dict_.end()) {
    need_rename_nodes = iter->second;
  }
  for (auto& node : need_rename_nodes) {
    auto except_iter =
        std::find(except_nodes.begin(), except_nodes.end(), node);
    if (except_iter != except_nodes.end()) {
      continue;
    }
    for (size_t i = 0; i < node->input_size(); ++i) {
      if (node->input(i) == old_name) {
        node->set_input(i, new_name);
      }
    }
  }
}

void QuantizeModelProcessor::UpdateInputNameToNodes() {
  name2node_dict_.clear();
  for (auto& node : *nodes_) {
    for (size_t i = 0; i < node->input_size(); ++i) {
      std::string node_input = node->input(i);
      if (name2node_dict_.find(node_input) != name2node_dict_.end()) {
        name2node_dict_[node_input].push_back(node);
      } else {
        name2node_dict_[node_input] = {node};
      }
    }
  }
}

void QuantizeModelProcessor::ProcessQuantizeModel(
    std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* parameters,
    std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* inputs,
    std::vector<std::shared_ptr<ONNX_NAMESPACE::ValueInfoProto>>* outputs,
    std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>>* nodes,
    OnnxHelper* helper, const std::string& deploy_backend,
    const PaddleParser& parser, std::string* calibration_cache) {
  // Determine whether the model contains quantization-related OPs; if not,
  // exit directly.
  bool quantized_model = false;
  for (auto& node : *nodes) {
    if (node->op_type() == "QuantizeLinear" ||
        node->op_type() == "DequantizeLinear") {
      quantized_model = true;
      break;
    }
  }
  if (!quantized_model) {
    return;
  }
  parser_ = &parser;
  helper_ = helper;
  parameters_ = parameters;
  inputs_ = inputs;
  outputs_ = outputs;
  nodes_ = nodes;
  P2OLogger() << "[Info] Quantize model deploy backend is: " << deploy_backend
              << std::endl;
  // Determine the format of the exported ONNX quantization model according
  // to the deploy_backend.
  if (deploy_backend == "others") {
    // If deploy_backend is "others", the quantized model is exported as a
    // float model plus a quantization table.
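    // A sketch of the table format, assuming a hypothetical tensor named
    // "conv2d_0.tmp_0" with scale 0.02: each line is "name: scale * 127",
    // so the entry reads "conv2d_0.tmp_0: 2.540000", i.e. the maximum
    // absolute value representable for that tensor.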
    RemoveAllQuantizeOps();
    std::ofstream outfile;
    outfile.open("max_range.txt", std::ios::out);
    if (!outfile.is_open()) {
      P2OLogger() << "[WARNING] Quantize model processor failed to write "
                     "range information in the current location."
                  << std::endl;
      return;
    }
    for (auto iter = helper_->quantize_info.begin();
         iter != helper_->quantize_info.end(); iter++) {
      std::string log = iter->first;
      auto scale = iter->second.scale_;
      if (scale.size() == 1) {
        log = log + ": " + std::to_string(scale[0] * 127);
        outfile << log << std::endl;
      }
    }
    outfile.close();
  } else if (deploy_backend == "onnxruntime") {
    // When deploy_backend is ONNXRuntime, use the following six steps to
    // process:
    // 1. broadcast quantize info
    // 2. remove all quantize ops
    // 3. merge conv and add
    // 4. merge conv and bn
    // 5. add Q and DQ according to the ONNXRuntime quantize OP fuse pattern
    // 6. topologically sort the nodes
    QuantizeInfoBroadcast();
    RemoveAllQuantizeOps();
    MergeConvAdd();
    MergeConvBN();
    AddQDQForORT();
    SortNodes();
  } else if (deploy_backend == "tensorrt") {
    // When deploy_backend is TensorRT, use the following steps to process:
    // For Explicit Quantization:
    // 1. broadcast quantize info
    // 2. remove all quantize ops
    // 3. add Q and DQ before conv and matmul
    // 4. topologically sort the nodes
    // For Implicit Quantization:
    // 1. remove all quantize ops
    // 2. broadcast quantize info
    // 3. save the float ONNX model and calibration.cache
    QuantizeInfoBroadcast();
    RemoveAllQuantizeOps();
    // Add Q and DQ for Explicit Quantization:
    // AddTrtQDQ();
    // SortNodes();
    // Generate calibration.cache for Implicit Quantization
    // (convert each float scale to hex).
    GenerateCache(calibration_cache);
  } else if (deploy_backend == "rknn") {
    // When deploy_backend is RKNN, use the following six steps to process:
    // 1. broadcast quantize info
    // 2. remove all quantize ops
    // 3. merge conv and add
    // 4. merge conv and bn
    // 5. add Q and DQ
    // 6. topologically sort the nodes
    QuantizeInfoBroadcast();
    RemoveAllQuantizeOps();
    MergeConvAdd();
    MergeConvBN();
    AddQDQForRKNN();
    SortNodes();
  } else {
    Assert(false,
           "[QuantizeModelProcessor] Only support 'onnxruntime' / 'tensorrt' "
           "/ 'rknn' / 'others' as backend now, but the backend is: " +
               deploy_backend + ".");
  }
}

void QuantizeModelProcessor::AddQDQForRKNN() {
  UpdateInputNameToNodes();
  supported_quantize_type_ = {
      "Abs",  "Acos", "Add",  "Asin", "Atan", "AveragePool",
      "BatchNormalization", "Ceil", "Clip", "Conv", "ConvTranspose",
      "Cos",  "Cosh", "Concat", "Elu", "Erf", "Exp", "Floor", "Gemm",
      "HardSigmoid", "InstanceNormalization", "IsInf", "IsNaN", "Log",
      "MaxPool", "Mul", "Neg", "ReduceMean", "Relu", "Resize", "Round",
      "Sigmoid", "Sin", "Sinh", "Split", "Sqrt", "Tan", "Tanh"};
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    auto type_iter = std::find(supported_quantize_type_.begin(),
                               supported_quantize_type_.end(), node->op_type());
    if (!supported_quantize_type_.empty() &&
        type_iter == supported_quantize_type_.end()) {
      continue;
    }
    // Add the missing scales for Mul, MatMul and Add.
    if (node->op_type() == "Mul" || node->op_type() == "MatMul" ||
        node->op_type() == "Add") {
      for (size_t i = 0; i < node->input_size(); ++i) {
        std::string node_input = node->input(i);
        if (helper_->quantize_info.find(node_input) !=
            helper_->quantize_info.end()) {
          continue;
        }
        std::vector<float> weight;
        if (!GetTensorByName(node_input, &weight)) {
          continue;
        }
        std::vector<int64_t> weight_shape;
        GetTensorShape(node_input, &weight_shape);
        int64_t quantize_axis = 1;
        if (node->op_type() == "Add") {
          quantize_axis = 0;
        }
        std::vector<float> scale;
        std::vector<int64_t> zeros;
        if (node->op_type() == "Add" || weight_shape.size() == 1 ||
            weight_shape.empty()) {
          GetTensorWiseQuantizeInfo(weight, &scale, &zeros);
        } else {
          GetChannelWiseQuantizeInfo(weight, weight_shape, quantize_axis,
                                     &scale, &zeros);
        }
        std::string weight_scale_node, weight_zero_node;
        if (scale.size() == 1) {
          weight_scale_node = helper_->Constant(
              {}, ONNX_NAMESPACE::TensorProto::FLOAT, scale[0]);
          weight_zero_node = helper_->Constant(
              {}, ONNX_NAMESPACE::TensorProto::INT8, zeros[0]);
        } else {
          weight_scale_node =
              helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
          weight_zero_node =
              helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
        }
        QuantizeInfo weight_quantize_info(scale, zeros, weight_scale_node,
                                          weight_zero_node, quantize_axis);
        helper_->quantize_info[node_input] = weight_quantize_info;
      }
    }
    std::vector<std::string> tensor_names;
    for (size_t i = 0; i < node->input_size(); ++i) {
      tensor_names.push_back(node->input(i));
    }
    for (size_t i = 0; i < node->output_size(); ++i) {
      tensor_names.push_back(node->output(i));
    }
    if (!CanBeQuantize(tensor_names)) {
      continue;
    }
    for (auto& name : tensor_names) {
      AppendQuantizeTensor(name);
    }
  }
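  // Every tensor name collected above is handed to AddQDQInModel, which
  // inserts one QuantizeLinear/DequantizeLinear pair per tensor.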
  // Update name2node_dict_ before inserting the Q and DQ nodes.
  UpdateInputNameToNodes();
  // Add Q and DQ into the model.
  AddQDQInModel(tensors_to_be_quantize);
}

void QuantizeModelProcessor::GenerateCache(std::string* calibration_cache) {
  union {
    float f;
    unsigned char farray[4];
  } un;
  *calibration_cache += "TRT-8XXX-EntropyCalibration2 \n";
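  // Each cache entry is "tensor_name: <8 hex digits>", the IEEE-754 bytes of
  // the float scale printed most-significant byte first; for example, a
  // scale of 1.0f (0x3F800000) is written as "3f800000". Reversing farray
  // below assumes a little-endian host.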
  for (auto iter = helper_->quantize_info.rbegin();
       iter != helper_->quantize_info.rend(); iter++) {
    std::string tensor_name = iter->first;
    QuantizeInfo quantize_info = iter->second;
    if (quantize_info.scale_.size() == 1) {
      float val = quantize_info.scale_[0];
      un.f = val;
      *calibration_cache += (tensor_name + ": ");
      std::stringstream enc;
      for (int64_t i = 3; i >= 0; i--) {
        enc << std::hex << std::setw(2) << std::setfill('0')
            << (int)(un.farray[i]);
      }
      *calibration_cache = *calibration_cache + enc.str() + "\n";
    }
  }
}

// In TensorRT, all quantized ops are: Conv, ConvTranspose, linear (MatMul),
// MaxPool, AvgPool, AdaptiveAvgPool and rnn (not supported now). See:
// https://github.com/NVIDIA/TensorRT/tree/main/tools/pytorch-quantization/pytorch_quantization/nn/modules
void QuantizeModelProcessor::AddTrtQDQ() {
  UpdateInputNameToNodes();
  // Save the tensor names that need quantize ops added.
  std::vector<std::string> quantize_tensors;
  std::vector<std::string> pool_types = {"MaxPool", "AvgPool",
                                         "AdaptiveAvgPool"};
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    quantize_tensors.clear();
    auto node = *iter;
    if (node->op_type() == "Conv" || node->op_type() == "ConvTranspose") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      quantize_tensors = tensor_names;
    }
    if (node->op_type() == "MatMul") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1)};
      for (auto& name : tensor_names) {
        if (helper_->quantize_info.find(name) !=
            helper_->quantize_info.end()) {
          continue;
        }
        std::vector<float> matmul_weight;
        if (!GetTensorByName(name, &matmul_weight)) {
          continue;
        }
        std::vector<int64_t> matmul_weight_shape;
        if (!GetTensorShape(name, &matmul_weight_shape)) {
          continue;
        }
        int64_t quantize_axis = 1;
        std::vector<float> scale;
        std::vector<int64_t> zeros;
        GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
                                   quantize_axis, &scale, &zeros);
        auto scale_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
        auto zero_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
        QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
                                                 zero_node, quantize_axis);
        helper_->quantize_info[name] = matmul_weight_quantize_info;
      }
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      quantize_tensors = tensor_names;
    }
    auto type_iter =
        std::find(pool_types.begin(), pool_types.end(), node->op_type());
    if (type_iter != pool_types.end()) {
      std::vector<std::string> tensor_names = {node->input(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      quantize_tensors = tensor_names;
    }
    std::string negative_scale_tensor = "";
    for (std::string& name : quantize_tensors) {
      Assert(
          helper_->quantize_info.find(name) != helper_->quantize_info.end(),
          "[QuantizeModelProcessor] Can not find quantize info for tensor: " +
              name);
      QuantizeInfo quantize_info = helper_->quantize_info[name];
      std::vector<float> scales = quantize_info.scale_;
      for (auto& i : scales) {
        if (i <= 1e-10) {
          negative_scale_tensor = negative_scale_tensor + " " + name;
        }
      }
    }
    if (negative_scale_tensor.size() > 0) {
      P2OLogger() << "[Warning] The scale of tensors: [ " +
                         negative_scale_tensor +
                         " ] contains a negative or near-zero scale, so this "
                         "OP will not be quantized."
                  << std::endl;
      continue;
    }
    // Each OP gets its own quantize/dequantize pair.
    for (std::string& name : quantize_tensors) {
      if (IsGraphOutput(name)) {
        continue;
      }
      QuantizeInfo quantize_info = helper_->quantize_info[name];
      std::string scale_node = quantize_info.scale_node_;
      std::string zeros_node = quantize_info.zeros_node_;
      int64_t quantize_axis = quantize_info.quantize_axis_;
      auto q_node =
          helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(q_node, "axis", quantize_axis);
      }
      auto dq_node = helper_->MakeNode(
          "DequantizeLinear", {q_node->output(0), scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(dq_node, "axis", quantize_axis);
      }
      for (size_t i = 0; i < node->input_size(); ++i) {
        if (node->input(i) == name) {
          node->set_input(i, dq_node->output(0));
        }
      }
    }
  }
}

// According to:
// https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
void QuantizeModelProcessor::AddQDQForORT() {
  UpdateInputNameToNodes();
  supported_quantize_type_ = {"Conv", "MatMul",    "Mul", "Sigmoid",
                              "Add",  "LeakyRelu", "Relu"};
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    auto type_iter = std::find(supported_quantize_type_.begin(),
                               supported_quantize_type_.end(), node->op_type());
    if (!supported_quantize_type_.empty() &&
        type_iter == supported_quantize_type_.end()) {
      continue;
    }
    // Only the op types below are handled here; every tensor that should get
    // Q and DQ added is saved in tensors_to_be_quantize.
    if (node->op_type() == "Relu") {
      std::vector<std::string> tensor_names = {node->input(0),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      // Rewrite Relu as LeakyRelu with alpha = 0 (mathematically identical)
      // so that it matches the ONNXRuntime QDQ fuse patterns.
      node->set_op_type("LeakyRelu");
      AddAttribute(node, "alpha", static_cast<float>(0.0));
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "LeakyRelu") {
      std::vector<std::string> tensor_names = {node->input(0),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Add") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Sigmoid") {
      std::vector<std::string> tensor_names = {node->input(0),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Conv") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      if (node->input_size() == 3) {
        tensor_names.push_back(node->input(2));
      }
      if (!CanBeQuantize(tensor_names, {2})) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "MatMul") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      for (auto& name : tensor_names) {
        if (helper_->quantize_info.find(name) !=
            helper_->quantize_info.end()) {
          continue;
        }
        std::vector<float> matmul_weight;
        if (!GetTensorByName(name, &matmul_weight)) {
          continue;
        }
        std::vector<int64_t> matmul_weight_shape;
        if (!GetTensorShape(name, &matmul_weight_shape)) {
          continue;
        }
        int64_t quantize_axis = 1;
        std::vector<float> scale;
        std::vector<int64_t> zeros;
        GetChannelWiseQuantizeInfo(matmul_weight, matmul_weight_shape,
                                   quantize_axis, &scale, &zeros);
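        // scale and zeros now hold one entry per output column of the
        // weight (per-channel quantization along axis 1).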
        auto scale_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, scale);
        auto zero_node =
            helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, zeros);
        QuantizeInfo matmul_weight_quantize_info(scale, zeros, scale_node,
                                                 zero_node, quantize_axis);
        helper_->quantize_info[name] = matmul_weight_quantize_info;
      }
      if (!CanBeQuantize(tensor_names)) {
        // Try again without the output tensor.
        tensor_names.pop_back();
        if (!CanBeQuantize(tensor_names)) {
          continue;
        }
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
    if (node->op_type() == "Mul") {
      std::vector<std::string> tensor_names = {node->input(0), node->input(1),
                                               node->output(0)};
      if (!CanBeQuantize(tensor_names)) {
        continue;
      }
      for (auto& name : tensor_names) {
        AppendQuantizeTensor(name);
      }
    }
  }
  // Update name2node_dict_ for the change of Relu ops.
  UpdateInputNameToNodes();
  // Add Q and DQ into the model.
  AddQDQInModel(tensors_to_be_quantize);
}

void QuantizeModelProcessor::AddQDQInModel(
    const std::vector<std::string>& tensors_to_be_quantize) {
  // Add Q and DQ according to tensors_to_be_quantize.
  for (auto& name : tensors_to_be_quantize) {
    if (IsGraphOutput(name)) {
      continue;
    }
    Assert(helper_->quantize_info.find(name) != helper_->quantize_info.end(),
           "[QuantizeModelProcessor] Can not find quantize info for tensor: " +
               name);
    QuantizeInfo quantize_info = helper_->quantize_info[name];
    std::string scale_node = quantize_info.scale_node_;
    std::string zeros_node = quantize_info.zeros_node_;
    int64_t quantize_axis = quantize_info.quantize_axis_;
    auto iter = std::find(only_dequantize_tensors.begin(),
                          only_dequantize_tensors.end(), name);
    if (iter != only_dequantize_tensors.end()) {
      // Only a DequantizeLinear is added (bias tensors).
      std::vector<float> scale = quantize_info.scale_;
      std::vector<float> bias;
      Assert(GetTensorByName(name, &bias),
             "[QuantizeModelProcessor] Can not find bias value: " + name);
      std::vector<int32_t> new_bias(bias.size(), 0);
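      // The bias is stored as INT32 and only dequantized: each element is
      // rounded to bias / scale, where the bias scale was registered as
      // act_scale * weight_scale by MergeConvBN / MergeConvAdd.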
      for (int64_t i = 0; i < bias.size(); i++) {
        float scale_val = scale.size() == 1 ? scale[0] : scale[i];
        new_bias[i] = rint(bias[i] / scale_val);
      }
      Weight updated_bias;
      std::vector<int64_t> bias_shape = {
          static_cast<int64_t>(new_bias.size())};
      updated_bias.set(P2ODataType::INT32, bias_shape, new_bias);
      helper_->updated_params[name] = updated_bias;
      auto dq_node = helper_->MakeNode("DequantizeLinear",
                                       {name, scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(dq_node, "axis", quantize_axis);
      }
      ReplaceInputOfAllNodes(name, dq_node->output(0));
    } else {
      // Handle the following situation:
      //          conv                      conv
      //         /  |  \          ->       /    \
      //     conv  conv  scale           QDQ    scale
      //                                /   \
      //                            conv     conv
      std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> except_nodes;
      auto next_nodes = name2node_dict_[name];
      if (next_nodes.size() > 1) {
        for (auto& node : next_nodes) {
          auto type_iter = std::find(supported_quantize_type_.begin(),
                                     supported_quantize_type_.end(),
                                     node->op_type());
          if (type_iter == supported_quantize_type_.end()) {
            except_nodes.push_back(node);
          }
        }
      }
      // When none of the consumers of this tensor can be renamed, the
      // quantize OP will be merged away, so do not except any node.
      if (next_nodes.size() == except_nodes.size()) {
        except_nodes.clear();
      }
      auto q_node =
          helper_->MakeNode("QuantizeLinear", {name, scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(q_node, "axis", quantize_axis);
      }
      auto dq_node = helper_->MakeNode(
          "DequantizeLinear", {q_node->output(0), scale_node, zeros_node});
      if (helper_->GetOpsetVersion() >= 13) {
        AddAttribute(dq_node, "axis", quantize_axis);
      }
      ReplaceInputOfAllNodes(name, dq_node->output(0), except_nodes);
    }
  }
}

void QuantizeModelProcessor::MergeConvBN() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto conv_node = *iter;
    if (conv_node->op_type() != "Conv") {
      continue;
    }
    bool act_has_quantize_info =
        helper_->quantize_info.find(conv_node->input(0)) !=
        helper_->quantize_info.end();
    if (!act_has_quantize_info) {
      continue;
    }
    auto next_nodes = name2node_dict_[conv_node->output(0)];
    if (next_nodes.size() > 1 || IsGraphOutput(conv_node->output(0))) {
      continue;
    }
    auto bn_node = next_nodes[0];
    if (bn_node->op_type() != "BatchNormalization" ||
        IsGraphOutput(bn_node->output(0))) {
      continue;
    }
    std::vector<float> conv_weight;
    Assert(GetTensorByName(conv_node->input(1), &conv_weight),
           "Can not get " + conv_node->input(1) + " from Conv.");
    std::vector<float> bn_scale;
    Assert(GetTensorByName(bn_node->input(1), &bn_scale),
           "Can not get " + bn_node->input(1) + " from BN.");
    std::vector<float> bn_bias;
    Assert(GetTensorByName(bn_node->input(2), &bn_bias),
           "Can not get " + bn_node->input(2) + " from BN.");
    std::vector<float> bn_mean;
    Assert(GetTensorByName(bn_node->input(3), &bn_mean),
           "Can not get " + bn_node->input(3) + " from BN.");
    std::vector<float> bn_var;
    Assert(GetTensorByName(bn_node->input(4), &bn_var),
           "Can not get " + bn_node->input(4) + " from BN.");
    // Default epsilon of ONNX BatchNormalization; overwritten by the
    // attribute if present.
    float epsilon = 1e-5f;
    for (auto i = 0; i < bn_node->attribute_size(); i++) {
      auto attr = bn_node->attribute(i);
      if (attr.name() == "epsilon") {
        epsilon = attr.f();
      }
    }
    std::vector<float> conv_bias(bn_bias.size(), 0);
    std::string conv_bias_node = conv_node->input(1) + ".merged.bias";
    if (conv_node->input_size() == 3) {
      conv_bias_node = conv_node->input(2);
      conv_bias.clear();
      Assert(GetTensorByName(conv_bias_node, &conv_bias),
             "Can not get " + conv_node->input(2) + " in Conv.");
    }
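    // Merge Conv and BN. Let alpha = bn_scale / sqrt(bn_var + epsilon); then
    //   BN(Conv(x) + b) = alpha * (Conv(x) + b - bn_mean) + bn_bias
    //                   = Conv'(x) + (b * alpha + bn_bias - bn_mean * alpha)
    // where Conv' scales each output-channel slice of the weight by alpha.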
    std::vector<float> alpha(bn_scale.size());
    for (int64_t i = 0; i < bn_scale.size(); i++) {
      alpha[i] = bn_scale[i] / sqrt(bn_var[i] + epsilon);
    }
    std::vector<float> new_bias(bn_scale.size());
    for (int64_t i = 0; i < bn_scale.size(); i++) {
      new_bias[i] =
          conv_bias[i] * alpha[i] + (bn_bias[i] - bn_mean[i] * alpha[i]);
    }
    std::vector<float> new_weight(conv_weight.size());
    int64_t offset = conv_weight.size() / bn_bias.size();
    for (int64_t i = 0; i < bn_scale.size(); i++) {
      int64_t outer_offset = i * offset;
      for (int64_t j = 0; j < offset; j++) {
        int64_t index = outer_offset + j;
        new_weight[index] = conv_weight[index] * alpha[i];
      }
    }
    // Update the weight.
    std::vector<int64_t> weight_shape;
    Assert(GetTensorShape(conv_node->input(1), &weight_shape),
           "Can not get the shape of " + conv_node->input(1) + " in Conv.");
    Weight updated_conv_weight;
    updated_conv_weight.set(P2ODataType::FP32, weight_shape, new_weight);
    helper_->updated_params[conv_node->input(1)] = updated_conv_weight;
    // Update the bias.
    Weight updated_bias_weight;
    std::vector<int64_t> bias_shape = {static_cast<int64_t>(new_bias.size())};
    updated_bias_weight.set(P2ODataType::FP32, bias_shape, new_bias);
    helper_->updated_params[conv_bias_node] = updated_bias_weight;
    AppendQuantizeTensor(conv_bias_node, true);
    // Update the weight scale.
    auto quantize_info = helper_->quantize_info[conv_node->input(1)];
    std::string scale_node = quantize_info.scale_node_;
    std::string zero_node = quantize_info.zeros_node_;
    int64_t quantize_axis = quantize_info.quantize_axis_;
    RemoveNodeByName(scale_node);
    RemoveNodeByName(zero_node);
    std::vector<float> scale = quantize_info.scale_;
    std::vector<float> new_scale;
    std::vector<int64_t> new_zeros;
    if (scale.size() == 1) {
      GetTensorWiseQuantizeInfo(new_weight, &new_scale, &new_zeros);
    } else {
      GetChannelWiseQuantizeInfo(new_weight, weight_shape, quantize_axis,
                                 &new_scale, &new_zeros);
    }
    auto weight_scale_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, new_scale);
    auto weight_zero_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::INT8, new_zeros);
    QuantizeInfo updated_weight_quantize_info(
        new_scale, new_zeros, weight_scale_node, weight_zero_node,
        quantize_axis);
    helper_->quantize_info[conv_node->input(1)] = updated_weight_quantize_info;
    // Add the bias scale and update the bias.
    auto act_quantize_info = helper_->quantize_info[conv_node->input(0)];
    std::vector<float> act_scale = act_quantize_info.scale_;
    std::vector<float> bias_scale;
    for (int64_t i = 0; i < new_scale.size(); i++) {
      bias_scale.push_back(act_scale[0] * new_scale[i]);
    }
    std::vector<int64_t> bias_zeros(bias_scale.size(), 0);
    auto bias_scale_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, bias_scale);
    auto bias_zero_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, bias_zeros);
    QuantizeInfo bias_quantize_info(bias_scale, bias_zeros, bias_scale_node,
                                    bias_zero_node, 0);
    helper_->quantize_info[conv_bias_node] = bias_quantize_info;
    if (conv_node->input_size() == 2) {
      conv_node->add_input(conv_bias_node);
    }
    // Remove the BN op.
    RemoveNodeByName(bn_node->name());
  }
}
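
// MergeConvAdd looks for the pattern Conv -> Add, where the other Add input
// is a constant bias fed through a Reshape to [1, C, 1, 1]; the Reshape and
// Add are removed and the bias is folded into the Conv bias input.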
void QuantizeModelProcessor::MergeConvAdd() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    if (node->op_type() != "Conv") {
      continue;
    }
    // If the activation input of Conv has no quantize info, skip it.
    bool act_has_quantize_info =
        helper_->quantize_info.find(node->input(0)) !=
        helper_->quantize_info.end();
    if (!act_has_quantize_info) {
      continue;
    }
    // If the weight of Conv has no quantize info, skip it.
    bool weight_has_quantize_info =
        helper_->quantize_info.find(node->input(1)) !=
        helper_->quantize_info.end();
    if (!weight_has_quantize_info) {
      continue;
    }
    auto next_nodes = name2node_dict_[node->output(0)];
    if (next_nodes.size() > 1 || IsGraphOutput(node->output(0))) {
      continue;
    }
    auto next_node = next_nodes[0];
    if (next_node->op_type() != "Add" ||
        IsGraphOutput(next_node->output(0))) {
      continue;
    }
    std::string reshape_node = node->output(0) == next_node->input(0)
                                   ? next_node->input(1)
                                   : next_node->input(0);
    std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> before_nodes;
    for (auto& prev_node : *nodes_) {
      for (size_t i = 0; i < prev_node->output_size(); ++i) {
        if (prev_node->output(i) == reshape_node) {
          before_nodes.push_back(prev_node);
          break;
        }
      }
    }
    if (before_nodes.size() != 1 || before_nodes[0]->op_type() != "Reshape") {
      continue;
    }
    std::string bias_node = before_nodes[0]->input(0);
    // Skip if the bias is not a constant.
    std::vector<float> bias_val;
    if (!GetTensorByName(bias_node, &bias_val)) {
      continue;
    }
    // Skip if the shape tensor of the Reshape op is not a constant.
    std::vector<int64_t> shape_val;
    if (!GetTensorByName(before_nodes[0]->input(1), &shape_val)) {
      continue;
    }
    // Skip if shape_val != [1, bias_val.size(), 1, 1].
    std::vector<int64_t> target = {1, static_cast<int64_t>(bias_val.size()),
                                   1, 1};
    if (target != shape_val) {
      continue;
    }
    // Remove the Reshape op.
    RemoveNodeByName(before_nodes[0]->name());
    // Add a scale for the bias.
    std::vector<float> weight_scale =
        helper_->quantize_info[node->input(1)].scale_;
    std::vector<float> act_scale =
        helper_->quantize_info[node->input(0)].scale_;
    std::vector<float> bias_scale;
    for (int64_t i = 0; i < weight_scale.size(); i++) {
      bias_scale.push_back(weight_scale[i] * act_scale[0]);
    }
    std::vector<int64_t> onnx_zeros(bias_scale.size(), 0);
    auto scale_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::FLOAT, bias_scale);
    auto zero_node =
        helper_->Constant(ONNX_NAMESPACE::TensorProto::INT32, onnx_zeros);
    QuantizeInfo quantize_info(bias_scale, onnx_zeros, scale_node, zero_node,
                               0);
    helper_->quantize_info[bias_node] = quantize_info;
    AppendQuantizeTensor(bias_node, true);
    node->add_input(bias_node);
    RemoveNodeByName(next_node->name());
  }
}

void QuantizeModelProcessor::SortNodes() {
  // Topologically sort the nodes:
  // 1. Build i2o_mapper and constant_nodes. i2o_mapper maps each node to all
  //    of its output nodes; constant_nodes saves all Constant nodes.
  // 2. Nodes without output nodes are saved to new_nodes first; then the
  //    records of each visited node are deleted from the i2o_mapper items,
  //    and nodes whose output-node lists become empty are appended to
  //    new_nodes in turn.
  // 3. Store the Constant nodes in new_nodes.
  // 4. Reverse new_nodes, then assign it to nodes_.
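  // Worked example, assuming a chain A -> B -> C plus a Constant K feeding
  // B: i2o_mapper = {A: [B], B: [C]}. C has no output nodes, so new_nodes
  // starts as [C]; processing C empties B's list (push B), processing B
  // empties A's list (push A), K is appended last, and reversing yields
  // [K, A, B, C].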
  std::map<std::string, std::vector<std::string>> i2o_mapper;
  std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> constant_nodes;
  std::map<std::string, std::shared_ptr<ONNX_NAMESPACE::NodeProto>>
      name2node_mapper;
  for (int64_t i = 0; i < nodes_->size(); i++) {
    auto node = (*nodes_)[i];
    if (node->op_type() == "Constant") {
      constant_nodes.push_back(node);
      continue;
    }
    name2node_mapper[node->name()] = node;
    for (int64_t in_index = 0; in_index < node->input_size(); in_index++) {
      std::string input = node->input(in_index);
      for (int64_t j = 0; j < nodes_->size(); j++) {
        if (i == j) {
          continue;
        }
        auto input_node = (*nodes_)[j];
        if (input_node->op_type() == "Constant") {
          continue;
        }
        for (int64_t out_index = 0; out_index < input_node->output_size();
             out_index++) {
          if (input == input_node->output(out_index)) {
            if (i2o_mapper.find(input_node->name()) == i2o_mapper.end()) {
              i2o_mapper[input_node->name()] = {node->name()};
            } else {
              auto name_iter =
                  std::find(i2o_mapper[input_node->name()].begin(),
                            i2o_mapper[input_node->name()].end(),
                            node->name());
              if (name_iter == i2o_mapper[input_node->name()].end()) {
                i2o_mapper[input_node->name()].push_back(node->name());
              }
            }
          }
        }
      }
    }
  }
  std::vector<std::shared_ptr<ONNX_NAMESPACE::NodeProto>> new_nodes;
  for (int64_t i = 0; i < nodes_->size(); i++) {
    auto node = (*nodes_)[i];
    auto node_name = node->name();
    if (node->op_type() == "Constant") {
      continue;
    }
    if (i2o_mapper.find(node_name) == i2o_mapper.end()) {
      new_nodes.push_back(node);
    }
  }
  int64_t index = 0;
  while (index < new_nodes.size()) {
    auto current_node = new_nodes[index];
    std::string current_node_name = current_node->name();
    for (auto iter = i2o_mapper.begin(); iter != i2o_mapper.end(); iter++) {
      std::string input_node_name = iter->first;
      std::vector<std::string>* output_nodes_name = &iter->second;
      if (output_nodes_name->empty()) {
        continue;
      }
      auto in_iter = std::find(output_nodes_name->begin(),
                               output_nodes_name->end(), current_node_name);
      if (in_iter != output_nodes_name->end()) {
        output_nodes_name->erase(in_iter);
      }
      if (output_nodes_name->empty()) {
        new_nodes.push_back(name2node_mapper[input_node_name]);
      }
    }
    index++;
  }
  for (auto& node : constant_nodes) {
    new_nodes.push_back(node);
  }
  std::reverse(new_nodes.begin(), new_nodes.end());
  Assert(nodes_->size() == new_nodes.size(),
         "The number of nodes after topological sorting is not equal to the "
         "number before sorting.");
  *nodes_ = new_nodes;
}

void QuantizeModelProcessor::RemoveAllQuantizeOps() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    if (node->op_type() != "QuantizeLinear") {
      continue;
    }
    auto next_node_names = name2node_dict_[node->output(0)];
    if (next_node_names.empty() || !next_node_names[0]->has_op_type() ||
        next_node_names[0]->op_type() != "DequantizeLinear") {
      continue;
    }
    std::string input_name = node->input(0);
    RemoveNodeByName(node->name());
    std::string output_name = next_node_names[0]->output(0);
    RemoveNodeByName(next_node_names[0]->name());
    ReplaceInputOfAllNodes(output_name, input_name);
  }
}

// Broadcast quantize info between the inputs and outputs of OPs that do not
// change quantize info.
void QuantizeModelProcessor::QuantizeInfoBroadcast() {
  UpdateInputNameToNodes();
  for (auto iter = nodes_->begin(); iter < nodes_->end(); iter++) {
    auto node = *iter;
    if (node->op_type() != "Identity") {
      continue;
    }
    std::string input_name = node->input(0);
    std::string output_name = node->output(0);
    auto input_quantize_info_iter = helper_->quantize_info.find(input_name);
    auto output_quantize_info_iter = helper_->quantize_info.find(output_name);
    // Neither the input nor the output of Identity has quantize info.
    if (input_quantize_info_iter == helper_->quantize_info.end() &&
        output_quantize_info_iter == helper_->quantize_info.end()) {
      continue;
    }
    // Both the input and the output of Identity already have quantize info.
    if (input_quantize_info_iter != helper_->quantize_info.end() &&
        output_quantize_info_iter != helper_->quantize_info.end()) {
      continue;
    }
    if (input_quantize_info_iter != helper_->quantize_info.end()) {
      helper_->quantize_info[output_name] = helper_->quantize_info[input_name];
    } else if (output_quantize_info_iter != helper_->quantize_info.end()) {
      helper_->quantize_info[input_name] = helper_->quantize_info[output_name];
    }
    if (ConnectToOutput(output_name)) {
      continue;
    }
    RemoveNodeByName(node->name());
    iter--;
  }
}

bool QuantizeModelProcessor::IsGraphOutput(const std::string& name) {
  for (auto& item : *outputs_) {
    auto out_node = (*item.get());
    if (name == out_node.name()) {
      return true;
    }
  }
  return false;
}

// Try to get the shape of a tensor.
bool QuantizeModelProcessor::GetTensorShape(const std::string& name,
                                            std::vector<int64_t>* shape) {
  for (auto& item : *parameters_) {
    auto node = *(item.get());
    if (node.output(0) != name) {
      continue;
    }
    for (auto i = 0; i < node.attribute_size(); i++) {
      auto attr = node.attribute(i);
      if (attr.name() == "value") {
        auto tensor = attr.mutable_t();
        for (int64_t j = 0; j < tensor->dims_size(); j++) {
          shape->push_back(tensor->dims(j));
        }
      }
    }
  }
  return !shape->empty();
}

void QuantizeModelProcessor::GetTensorWiseQuantizeInfo(
    const std::vector<float>& tensor, std::vector<float>* scale,
    std::vector<int64_t>* zero) {
  float max_val = -1;
  for (int64_t i = 0; i < tensor.size(); i++) {
    if (fabs(tensor[i]) > max_val) {
      max_val = fabs(tensor[i]);
    }
  }
  Assert(max_val >= 0,
         "[GetTensorWiseQuantizeInfo] Require the scale >= 0, but now it's " +
             std::to_string(max_val) + ".");
  scale->push_back(max_val / 127);
  zero->push_back(0);
}

void QuantizeModelProcessor::GetChannelWiseQuantizeInfo(
    const std::vector<float>& tensor, const std::vector<int64_t>& shape,
    const int64_t& quant_axis, std::vector<float>* scale,
    std::vector<int64_t>* zero) {
  int64_t channel_count = shape[quant_axis];
  for (int64_t i = 0; i < channel_count; i++) {
    if (quant_axis == 0) {
      float max_val = -1;
      int64_t inner_offset = 1;
      for (auto& j : shape) {
        inner_offset *= j;
      }
      inner_offset /= channel_count;
      int64_t index = i * inner_offset;
      for (int64_t j = 0; j < inner_offset; j++) {
        if (fabs(tensor[index + j]) > max_val) {
          max_val = fabs(tensor[index + j]);
        }
      }
      Assert(max_val >= 0,
             "[GetChannelWiseQuantizeInfo] Require the scale >= 0, but now "
             "it's " +
                 std::to_string(max_val) + ".");
      scale->push_back(max_val / 127);
      zero->push_back(0);
    } else if (quant_axis == 1) {
      float max_val = -1;
      int64_t inner_offset = shape.size() == 4 ? shape[2] * shape[3] : 1;
      for (int64_t outer = 0; outer < shape[0]; outer++) {
        int64_t index = outer * channel_count * inner_offset;
        for (int64_t inner = 0; inner < inner_offset; inner++) {
          int64_t final_index = index + i * inner_offset + inner;
          if (fabs(tensor[final_index]) > max_val) {
            max_val = fabs(tensor[final_index]);
          }
        }
      }
      Assert(max_val >= 0,
             "[GetChannelWiseQuantizeInfo] Require the scale >= 0, but now "
             "it's " +
                 std::to_string(max_val) + ".");
      scale->push_back(max_val / 127);
      zero->push_back(0);
    } else {
      Assert(false,
             "QuantizeModelProcessor::GetChannelWiseQuantizeInfo only "
             "supports quant_axis equal to 0 or 1, but now it's " +
                 std::to_string(quant_axis) + ".");
    }
  }
}
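
// A worked example for the two helpers above, assuming a channel whose
// largest absolute value is 2.54: the channel scale is 2.54 / 127 = 0.02,
// so the INT8 value 127 dequantizes back to 2.54. Zero points are always 0
// (symmetric quantization).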
template <typename T>
bool QuantizeModelProcessor::GetTensorByName(const std::string& name,
                                             std::vector<T>* value) {
  // Find tensor values in the following order; if found, store the data in
  // value and return true:
  // 1. updated_params, the weight of Conv or MatMul.
  // 2. parameters of the original graph, e.g. the scale or bias of BN.
  // 3. Constant nodes in nodes, other values.
  auto updated_params_iter = helper_->updated_params.find(name);
  if (updated_params_iter != helper_->updated_params.end()) {
    (updated_params_iter->second).get(value);
    return true;
  }
  for (int64_t block_index = 0; block_index < parser_->NumOfBlocks();
       block_index++) {
    if (parser_->TryGetTensorValue(block_index, name, value)) {
      return true;
    }
  }
  return helper_->TryGetTensorValue(name, value);
}

bool QuantizeModelProcessor::ConnectToOutput(const std::string& output_name) {
  std::vector<std::string> names = {output_name};
  while (!names.empty()) {
    std::string name = names.back();
    names.pop_back();
    if (IsGraphOutput(name)) {
      return true;
    }
    auto next_nodes = name2node_dict_[name];
    for (auto& next : next_nodes) {
      if (next->op_type() == "Identity") {
        names.push_back(next->output(0));
      }
    }
  }
  return false;
}

bool QuantizeModelProcessor::CanBeQuantize(
    const std::vector<std::string>& tensor_names,
    const std::vector<int64_t>& output_index) {
  for (auto& tensor : tensor_names) {
    if (helper_->quantize_info.find(tensor) == helper_->quantize_info.end()) {
      return false;
    }
  }
  // If the OP is linked to a graph output through Identity ops, skip it and
  // do not quantize the OP.
  for (auto i = 0; i < output_index.size(); i++) {
    int64_t index = output_index[i];
    if (index == -1) {
      index = tensor_names.size() - 1;
    }
    std::string output_name = tensor_names[index];
    if (ConnectToOutput(output_name)) {
      return false;
    }
  }
  return true;
}

void QuantizeModelProcessor::AppendQuantizeTensor(
    const std::string& tensor, const bool& only_dequantize) {
  if (only_dequantize) {
    if (std::find(only_dequantize_tensors.begin(),
                  only_dequantize_tensors.end(),
                  tensor) == only_dequantize_tensors.end()) {
      only_dequantize_tensors.push_back(tensor);
    }
  } else {
    if (std::find(tensors_to_be_quantize.begin(), tensors_to_be_quantize.end(),
                  tensor) == tensors_to_be_quantize.end()) {
      tensors_to_be_quantize.push_back(tensor);
    }
  }
}

}  // namespace paddle2onnx