diff --git a/examples/runtime/README.md b/examples/runtime/README.md
index 849d6ecef..18651bd69 100644
--- a/examples/runtime/README.md
+++ b/examples/runtime/README.md
@@ -1,16 +1,16 @@
-# FastDeploy Runtime推理示例
+# FastDeploy Runtime examples
 
-| 示例代码 | 编程语言 | 说明 |
+| Example Code | Program Language | Description |
 | :------- | :------- | :---- |
-| python/infer_paddle_paddle_inference.py | Python | paddle模型通过paddle inference在cpu/gpu上的推理 |
-| python/infer_paddle_tensorrt.py | Python | paddle模型通过tensorrt在gpu上的推理 |
-| python/infer_paddle_openvino.py | Python | paddle模型通过openvino在cpu上的推理 |
-| python/infer_paddle_onnxruntime.py | Python | paddle模型通过onnx runtime在cpu/gpu上的推理 |
-| python/infer_onnx_openvino.py | Python | onnx模型通过openvino在cpu上的推理 |
-| python/infer_onnx_tensorrt.py | Python | onnx模型通过tensorrt在gpu上的推理 |
-| cpp/infer_paddle_paddle_inference.cc | C++ | paddle模型通过paddle inference在cpu/gpu上的推理 |
-| cpp/infer_paddle_tensorrt.cc | C++ | paddle模型通过tensorrt在gpu上的推理 |
-| cpp/infer_paddle_openvino.cc | C++ | paddle模型通过openvino在cpu上的推理 |
-| cpp/infer_paddle_onnxruntime.cc | C++ | paddle模型通过onnx runtime在cpu/gpu上的推理 |
-| cpp/infer_onnx_openvino.cc | C++ | onnx模型通过openvino在cpu上的推理 |
-| cpp/infer_onnx_tensorrt.cc | C++ | onnx模型通过tensorrt在gpu上的推理 |
+| python/infer_paddle_paddle_inference.py | Python | Deploy Paddle model with Paddle Inference(CPU/GPU) |
+| python/infer_paddle_tensorrt.py | Python | Deploy Paddle model with TensorRT(GPU) |
+| python/infer_paddle_openvino.py | Python | Deploy Paddle model with OpenVINO(CPU)  |
+| python/infer_paddle_onnxruntime.py | Python | Deploy Paddle model with ONNX Runtime(CPU/GPU)  |
+| python/infer_onnx_openvino.py | Python | Deploy ONNX model with OpenVINO(CPU) |
+| python/infer_onnx_tensorrt.py | Python | Deploy ONNX model with TensorRT(GPU) |
+| cpp/infer_paddle_paddle_inference.cc | C++ | Deploy Paddle model with Paddle Inference(CPU/GPU) |
+| cpp/infer_paddle_tensorrt.cc | C++ | Deploy Paddle model with TensorRT(GPU) |
+| cpp/infer_paddle_openvino.cc | C++ | Deploy Paddle model with OpenVINO(CPU |
+| cpp/infer_paddle_onnxruntime.cc | C++ | Deploy Paddle model with ONNX Runtime(CPU/GPU) |
+| cpp/infer_onnx_openvino.cc | C++ | Deploy ONNX model with OpenVINO(CPU) |
+| cpp/infer_onnx_tensorrt.cc | C++ | Deploy ONNX model with TensorRT(GPU) |
diff --git a/fastdeploy/vision/common/processors/normalize.cc b/fastdeploy/vision/common/processors/normalize.cc
index 726ba67a7..e16379ba5 100644
--- a/fastdeploy/vision/common/processors/normalize.cc
+++ b/fastdeploy/vision/common/processors/normalize.cc
@@ -19,7 +19,7 @@ namespace vision {
 Normalize::Normalize(const std::vector<float>& mean,
                      const std::vector<float>& std, bool is_scale,
                      const std::vector<float>& min,
-                     const std::vector<float>& max) {
+                     const std::vector<float>& max, bool swap_rb) {
   FDASSERT(mean.size() == std.size(),
            "Normalize: requires the size of mean equal to the size of std.");
   std::vector<double> mean_(mean.begin(), mean.end());
@@ -50,6 +50,7 @@ Normalize::Normalize(const std::vector<float>& mean,
     alpha_.push_back(alpha);
     beta_.push_back(beta);
   }
+  swap_rb_ = swap_rb;
 }
 
 bool Normalize::ImplByOpenCV(Mat* mat) {
@@ -57,6 +58,7 @@ bool Normalize::ImplByOpenCV(Mat* mat) {
 
   std::vector<cv::Mat> split_im;
   cv::split(*im, split_im);
+  if (swap_rb_) std::swap(split_im[0], split_im[2]);
   for (int c = 0; c < im->channels(); c++) {
     split_im[c].convertTo(split_im[c], CV_32FC1, alpha_[c], beta_[c]);
   }
@@ -79,9 +81,13 @@ bool Normalize::ImplByFlyCV(Mat* mat) {
     std[i] = 1.0 / alpha_[i];
     mean[i] = -1 * beta_[i] * std[i];
   }
+
+  std::vector<uint32_t> channel_reorder_index = {0, 1, 2};
+  if (swap_rb_) std::swap(channel_reorder_index[0], channel_reorder_index[2]);
+
   fcv::Mat new_im(im->width(), im->height(),
                   fcv::FCVImageType::PKG_BGR_F32);
-  fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector<uint32_t>(),
+  fcv::normalize_to_submean_to_reorder(*im, mean, std, channel_reorder_index,
                                        new_im, true);
   mat->SetMat(new_im);
   return true;
@@ -91,8 +97,8 @@ bool Normalize::ImplByFlyCV(Mat* mat) {
 bool Normalize::Run(Mat* mat, const std::vector<float>& mean,
                     const std::vector<float>& std, bool is_scale,
                     const std::vector<float>& min,
-                    const std::vector<float>& max, ProcLib lib) {
-  auto n = Normalize(mean, std, is_scale, min, max);
+                    const std::vector<float>& max, ProcLib lib, bool swap_rb) {
+  auto n = Normalize(mean, std, is_scale, min, max, swap_rb);
   return n(mat, lib);
 }
 
diff --git a/fastdeploy/vision/common/processors/normalize.h b/fastdeploy/vision/common/processors/normalize.h
index 515fcd7e6..c489207df 100644
--- a/fastdeploy/vision/common/processors/normalize.h
+++ b/fastdeploy/vision/common/processors/normalize.h
@@ -23,7 +23,8 @@ class FASTDEPLOY_DECL Normalize : public Processor {
   Normalize(const std::vector<float>& mean, const std::vector<float>& std,
             bool is_scale = true,
             const std::vector<float>& min = std::vector<float>(),
-            const std::vector<float>& max = std::vector<float>());
+            const std::vector<float>& max = std::vector<float>(),
+            bool swap_rb = false);
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
   bool ImplByFlyCV(Mat* mat);
@@ -44,14 +45,23 @@ class FASTDEPLOY_DECL Normalize : public Processor {
                   const std::vector<float>& std, bool is_scale = true,
                   const std::vector<float>& min = std::vector<float>(),
                   const std::vector<float>& max = std::vector<float>(),
-                  ProcLib lib = ProcLib::DEFAULT);
+                  ProcLib lib = ProcLib::DEFAULT, bool swap_rb = false);
 
   std::vector<float> GetAlpha() const { return alpha_; }
   std::vector<float> GetBeta() const { return beta_; }
 
+  bool GetSwapRB() {
+    return swap_rb_;
+  }
+
+  void SetSwapRB(bool swap_rb) {
+    swap_rb_ = swap_rb;
+  }
+
  private:
   std::vector<float> alpha_;
   std::vector<float> beta_;
+  bool swap_rb_;
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.cc b/fastdeploy/vision/common/processors/normalize_and_permute.cc
index cb78cc720..ca1565ec8 100644
--- a/fastdeploy/vision/common/processors/normalize_and_permute.cc
+++ b/fastdeploy/vision/common/processors/normalize_and_permute.cc
@@ -21,7 +21,8 @@ NormalizeAndPermute::NormalizeAndPermute(const std::vector<float>& mean,
                                          const std::vector<float>& std,
                                          bool is_scale,
                                          const std::vector<float>& min,
-                                         const std::vector<float>& max) {
+                                         const std::vector<float>& max,
+                                         bool swap_rb) {
   FDASSERT(mean.size() == std.size(),
            "Normalize: requires the size of mean equal to the size of std.");
   std::vector<double> mean_(mean.begin(), mean.end());
@@ -52,6 +53,7 @@ NormalizeAndPermute::NormalizeAndPermute(const std::vector<float>& mean,
     alpha_.push_back(alpha);
     beta_.push_back(beta);
   }
+  swap_rb_ = swap_rb;
 }
 
 bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) {
@@ -60,6 +62,7 @@ bool NormalizeAndPermute::ImplByOpenCV(Mat* mat) {
   int origin_h = im->rows;
   std::vector<cv::Mat> split_im;
   cv::split(*im, split_im);
+  if (swap_rb_) std::swap(split_im[0], split_im[2]);
   for (int c = 0; c < im->channels(); c++) {
     split_im[c].convertTo(split_im[c], CV_32FC1, alpha_[c], beta_[c]);
   }
@@ -94,8 +97,12 @@ bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) {
     std[i] = 1.0 / alpha_[i];
     mean[i] = -1 * beta_[i] * std[i];
   }
+
+  std::vector<uint32_t> channel_reorder_index = {0, 1, 2};
+  if (swap_rb_) std::swap(channel_reorder_index[0], channel_reorder_index[2]);
+
   fcv::Mat new_im;
-  fcv::normalize_to_submean_to_reorder(*im, mean, std, std::vector<uint32_t>(),
+  fcv::normalize_to_submean_to_reorder(*im, mean, std, channel_reorder_index,
                                        new_im, false);
   mat->SetMat(new_im);
   mat->layout = Layout::CHW;
@@ -106,8 +113,9 @@ bool NormalizeAndPermute::ImplByFlyCV(Mat* mat) {
 bool NormalizeAndPermute::Run(Mat* mat, const std::vector<float>& mean,
                               const std::vector<float>& std, bool is_scale,
                               const std::vector<float>& min,
-                              const std::vector<float>& max, ProcLib lib) {
-  auto n = NormalizeAndPermute(mean, std, is_scale, min, max);
+                              const std::vector<float>& max, ProcLib lib,
+                              bool swap_rb) {
+  auto n = NormalizeAndPermute(mean, std, is_scale, min, max, swap_rb);
   return n(mat, lib);
 }
 
diff --git a/fastdeploy/vision/common/processors/normalize_and_permute.h b/fastdeploy/vision/common/processors/normalize_and_permute.h
index ec4766526..04715d9d7 100644
--- a/fastdeploy/vision/common/processors/normalize_and_permute.h
+++ b/fastdeploy/vision/common/processors/normalize_and_permute.h
@@ -23,7 +23,8 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor {
   NormalizeAndPermute(const std::vector<float>& mean,
                       const std::vector<float>& std, bool is_scale = true,
                       const std::vector<float>& min = std::vector<float>(),
-                      const std::vector<float>& max = std::vector<float>());
+                      const std::vector<float>& max = std::vector<float>(),
+                      bool swap_rb = false);
   bool ImplByOpenCV(Mat* mat);
 #ifdef ENABLE_FLYCV
   bool ImplByFlyCV(Mat* mat);
@@ -44,7 +45,7 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor {
                   const std::vector<float>& std, bool is_scale = true,
                   const std::vector<float>& min = std::vector<float>(),
                   const std::vector<float>& max = std::vector<float>(),
-                  ProcLib lib = ProcLib::DEFAULT);
+                  ProcLib lib = ProcLib::DEFAULT, bool swap_rb = false);
 
   void SetAlpha(const std::vector<float>& alpha) {
     alpha_.clear();
@@ -58,9 +59,18 @@ class FASTDEPLOY_DECL NormalizeAndPermute : public Processor {
     beta_.assign(beta.begin(), beta.end());
   }
 
+  bool GetSwapRB() {
+    return swap_rb_;
+  }
+
+  void SetSwapRB(bool swap_rb) {
+    swap_rb_ = swap_rb;
+  }
+
  private:
   std::vector<float> alpha_;
   std::vector<float> beta_;
+  bool swap_rb_;
 };
 }  // namespace vision
 }  // namespace fastdeploy
diff --git a/fastdeploy/vision/common/processors/transform.cc b/fastdeploy/vision/common/processors/transform.cc
index 8d440b9c6..d54a4bca4 100644
--- a/fastdeploy/vision/common/processors/transform.cc
+++ b/fastdeploy/vision/common/processors/transform.cc
@@ -95,10 +95,77 @@ void FuseNormalizeHWC2CHW(
          << std::endl;
 }
 
+void FuseNormalizeColorConvert(
+    std::vector<std::shared_ptr<Processor>>* processors) {
+  // Fuse Normalize and BGR2RGB/RGB2BGR
+  int normalize_index = -1;
+  int color_convert_index = -1;
+  // If these middle processors are after BGR2RGB/RGB2BGR and before Normalize,
+  // we can still fuse Normalize and BGR2RGB/RGB2BGR
+  static std::unordered_set<std::string> middle_processors(
+      {"Resize", "ResizeByShort", "ResizeByLong", "Crop", "CenterCrop",
+       "LimitByStride", "LimitShort", "Pad", "PadToSize", "StridePad",
+       "WarpAffine"});
+
+  for (size_t i = 0; i < processors->size(); ++i) {
+    if ((*processors)[i]->Name() == "BGR2RGB" ||
+        (*processors)[i]->Name() == "RGB2BGR") {
+      color_convert_index = i;
+      for (size_t j = color_convert_index + 1; j < processors->size(); ++j) {
+        if ((*processors)[j]->Name() == "Normalize" ||
+            (*processors)[j]->Name() == "NormalizeAndPermute") {
+          normalize_index = j;
+          break;
+        }
+      }
+      if (normalize_index < 0) {
+        return;
+      }
+      for (size_t j = color_convert_index + 1; j < normalize_index; ++j) {
+        if (middle_processors.count((*processors)[j]->Name())) {
+          continue;
+        }
+        return;
+      }
+    }
+  }
+
+  if (color_convert_index < 0) {
+    return;
+  }
+
+  // Delete Color Space Convert
+  std::string color_processor_name = (*processors)[color_convert_index]->Name();
+  processors->erase(processors->begin() + color_convert_index);
+
+  // Toggle the swap_rb option of the Normalize processor
+  std::string normalize_processor_name =
+      (*processors)[normalize_index - 1]->Name();
+  bool swap_rb;
+  if (normalize_processor_name == "Normalize") {
+    auto processor = dynamic_cast<Normalize*>(
+        (*processors)[normalize_index - 1].get());
+    swap_rb = processor->GetSwapRB();
+    processor->SetSwapRB(!swap_rb);
+  } else if (normalize_processor_name == "NormalizeAndPermute") {
+    auto processor = dynamic_cast<NormalizeAndPermute*>(
+        (*processors)[normalize_index - 1].get());
+    swap_rb = processor->GetSwapRB();
+    processor->SetSwapRB(!swap_rb);
+  } else {
+    FDASSERT(false, "Something wrong in FuseNormalizeColorConvert().");
+  }
+
+  FDINFO << color_processor_name << " and " << normalize_processor_name
+         << " are fused to " << normalize_processor_name
+         << " with swap_rb=" << !swap_rb << std::endl;
+}
+
 void FuseTransforms(
     std::vector<std::shared_ptr<Processor>>* processors) {
   FuseNormalizeCast(processors);
   FuseNormalizeHWC2CHW(processors);
+  FuseNormalizeColorConvert(processors);
 }
 
 
diff --git a/fastdeploy/vision/common/processors/transform.h b/fastdeploy/vision/common/processors/transform.h
index 53f7ffd63..2a914fff7 100644
--- a/fastdeploy/vision/common/processors/transform.h
+++ b/fastdeploy/vision/common/processors/transform.h
@@ -31,6 +31,7 @@
 #include "fastdeploy/vision/common/processors/resize_by_short.h"
 #include "fastdeploy/vision/common/processors/stride_pad.h"
 #include "fastdeploy/vision/common/processors/warp_affine.h"
+#include <unordered_set>
 
 namespace fastdeploy {
 namespace vision {
@@ -41,6 +42,9 @@ void FuseTransforms(std::vector<std::shared_ptr<Processor>>* processors);
 void FuseNormalizeCast(std::vector<std::shared_ptr<Processor>>* processors);
 // Fuse Normalize + HWC2CHW to NormalizeAndPermute
 void FuseNormalizeHWC2CHW(std::vector<std::shared_ptr<Processor>>* processors);
+// Fuse Normalize + Color Convert
+void FuseNormalizeColorConvert(
+    std::vector<std::shared_ptr<Processor>>* processors);
 
 }  // namespace vision
 }  // namespace fastdeploy