mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
Sync v2.0 version of code to github repo
This commit is contained in:
@@ -1,188 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "dtype.h"
|
||||
#include "matmul_helper.h"
|
||||
#include "my_types.h"
|
||||
#include "paddle/extension.h"
|
||||
#include "paddle/phi/core/kernel_registry.h"
|
||||
template <typename T>
|
||||
void AvxCompute(const paddle::Tensor &x,
|
||||
const paddle::Tensor &weight,
|
||||
const paddle::Tensor &w_bias,
|
||||
bool trans,
|
||||
const std::string alog,
|
||||
paddle::Tensor &out,
|
||||
xft::Matrix<T> &quantizedWeight,
|
||||
xft::Vector<float> &WeightScale,
|
||||
xft::Vector<float> &WeightZero,
|
||||
xft::Vector<float> &WeightSum,
|
||||
MMHelper *mmHelper) {
|
||||
auto out_data = out.data<float>();
|
||||
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
|
||||
const float *bias_data = nullptr;
|
||||
if (w_bias.initialized()) {
|
||||
bias_data = reinterpret_cast<const float *>(w_bias.data<float>());
|
||||
}
|
||||
int m = 1;
|
||||
for (int i = 0; i < x.shape().size() - 1; i++) {
|
||||
m = m * x.shape()[i];
|
||||
}
|
||||
int k = x.shape()[x.shape().size() - 1];
|
||||
int l = weight.shape()[1];
|
||||
int n = weight.shape()[1];
|
||||
if (w_bias.initialized()) {
|
||||
mmHelper->compute_bias(false,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
1.0f,
|
||||
x_data,
|
||||
k,
|
||||
quantizedWeight.Data(),
|
||||
WeightScale.Data(),
|
||||
WeightZero.Data(),
|
||||
WeightSum.Data(),
|
||||
0.0f,
|
||||
out_data,
|
||||
l,
|
||||
bias_data);
|
||||
} else {
|
||||
mmHelper->compute(false,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
1.0f,
|
||||
x_data,
|
||||
k,
|
||||
quantizedWeight.Data(),
|
||||
WeightScale.Data(),
|
||||
WeightZero.Data(),
|
||||
WeightSum.Data(),
|
||||
0.0,
|
||||
out_data,
|
||||
l);
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
void AvxWeightOnly(const paddle::Tensor &x,
|
||||
const paddle::Tensor &weight,
|
||||
const paddle::Tensor &w_bias,
|
||||
bool trans,
|
||||
const std::string alog,
|
||||
paddle::Tensor &out) {
|
||||
static std::unordered_map<std::string,
|
||||
std::tuple<xft::Matrix<T> *,
|
||||
xft::Vector<float> *,
|
||||
xft::Vector<float> *,
|
||||
xft::Vector<float> *>>
|
||||
weight_only_hub;
|
||||
std::stringstream weights_addr;
|
||||
weights_addr << weight.data<float>() << alog;
|
||||
std::string weight_only_key = weights_addr.str();
|
||||
auto it_created = weight_only_hub.find(weight_only_key);
|
||||
static MMHelper *mmHelper;
|
||||
int rows = weight.shape()[0], cols = weight.shape()[1];
|
||||
xft::Vector<float> *WeightScale =
|
||||
new xft::Vector<float>(); // if weight is int8
|
||||
xft::Vector<float> *WeightZero =
|
||||
new xft::Vector<float>(); // if weight is int8
|
||||
xft::Vector<float> *WeightSum =
|
||||
new xft::Vector<float>(); // if weight is int8
|
||||
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
|
||||
if (it_created == weight_only_hub.end()) {
|
||||
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
|
||||
xft::Matrix<T> convertedWeight;
|
||||
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
|
||||
mmHelper->convertWeight(trans,
|
||||
rows,
|
||||
cols,
|
||||
weight_ptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
convertedWeight,
|
||||
*WeightScale,
|
||||
*WeightZero,
|
||||
*WeightSum);
|
||||
quantizedWeight->Resize(rows, cols);
|
||||
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
|
||||
weight_only_hub[weight_only_key] = std::make_tuple(
|
||||
quantizedWeight, WeightScale, WeightZero, WeightSum);
|
||||
AvxCompute<T>(x,
|
||||
weight,
|
||||
w_bias,
|
||||
trans,
|
||||
alog,
|
||||
out,
|
||||
*quantizedWeight,
|
||||
*WeightScale,
|
||||
*WeightZero,
|
||||
*WeightSum,
|
||||
mmHelper);
|
||||
} else {
|
||||
AvxCompute<T>(x,
|
||||
weight,
|
||||
w_bias,
|
||||
trans,
|
||||
alog,
|
||||
out,
|
||||
*(std::get<0>(it_created->second)),
|
||||
*(std::get<1>(it_created->second)),
|
||||
*(std::get<2>(it_created->second)),
|
||||
*(std::get<3>(it_created->second)),
|
||||
mmHelper);
|
||||
}
|
||||
}
|
||||
// Kernel entry for the avx_weight_only op.
// Allocates the output ([...leading dims of x..., weight_out_features]) and
// dispatches on the quantization algorithm `alog`.
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
                                                const paddle::Tensor &weight,
                                                const paddle::Tensor &w_bias,
                                                const std::string &alog,
                                                bool trans) {
    // Same shape as x except the last dim becomes the weight's output width.
    auto out_shape = x.shape();
    out_shape[out_shape.size() - 1] = weight.shape()[1];
    auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
    if (alog == "int8") {
        AvxWeightOnly<int8_t>(x, weight, w_bias, trans, alog, out);
    } else {
        // "fp16" — and, preserving original behavior, any unrecognized
        // algorithm silently falls back to fp16 packing (the original
        // "fp16" and `else` branches were byte-identical).
        AvxWeightOnly<float16_t>(x, weight, w_bias, trans, alog, out);
    }
    return {out};
}
|
||||
|
||||
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
|
||||
std::vector<int64_t> x_shape,
|
||||
std::vector<int64_t> weigh_shape,
|
||||
std::vector<int64_t> weigh_bias_shape) {
|
||||
int m = 1;
|
||||
for (int i = 0; i < x_shape.size() - 1; i++) {
|
||||
m = m * x_shape[i];
|
||||
}
|
||||
return {std::vector<int64_t>{m, weigh_shape[1]}};
|
||||
}
|
||||
|
||||
// Dtype inference for avx_weight_only: the output keeps the activation
// dtype of x; the weight/bias dtypes do not influence the result.
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
    paddle::DataType x_dtype,
    paddle::DataType weight_dtype,
    paddle::DataType weight_bias_dtype) {
    return {x_dtype};
}
|
||||
|
||||
// Registers the avx_weight_only custom op:
//   out = x @ weight (+ w_bias), with the weight packed/quantized per the
//   `alog` attribute ("int8" or "fp16") and executed via xFT AVX kernels.
PD_BUILD_STATIC_OP(avx_weight_only)
    .Inputs({"x", "weight", "w_bias"})
    .Outputs({"out"})
    .Attrs({"alog: std::string", "trans:bool"})
    .SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
    .SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));
|
||||
268
custom_ops/cpu_ops/rebuild_padding.cc
Normal file
268
custom_ops/cpu_ops/rebuild_padding.cc
Normal file
@@ -0,0 +1,268 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <vector>
|
||||
#include "paddle/extension.h"
|
||||
|
||||
#ifndef PD_BUILD_STATIC_OP
|
||||
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
|
||||
#endif
|
||||
|
||||
// Gathers one hidden-state row per batch entry out of the padded input:
// for every active sequence, copies the row of its "current" token
// (last prompt token during prefill, position 0 during decode) into the
// [bsz, dim_embed] output. Inactive sequences leave their output row as-is.
template <typename T>
void RebuildPaddingCPUImpl(T *output_data,
                           const T *input_data,
                           const int *cum_offsets_data,
                           const int *seq_len_this_time_data,
                           const int *seq_lens_decoder_data,
                           const int *seq_lens_encoder_data,
                           int max_input_length,
                           int dim_embed,
                           const int elem_nums) {
    for (int idx = 0; idx < elem_nums; ++idx) {
        const int batch = idx / dim_embed;
        const int col = idx % dim_embed;

        // Skip sequences with no tokens this step or no state at all.
        const bool no_tokens = seq_len_this_time_data[batch] == 0;
        const bool inactive = seq_lens_decoder_data[batch] == 0 &&
                              seq_lens_encoder_data[batch] == 0;
        if (no_tokens || inactive) {
            continue;
        }

        // Prefill: the token of interest is the last prompt token (len - 1);
        // decode: it is the single token at position 0.
        const int local_pos = seq_lens_encoder_data[batch] > 0
                                      ? seq_lens_encoder_data[batch] - 1
                                      : 0;
        const int token_row =
            batch * max_input_length - cum_offsets_data[batch] + local_pos;
        output_data[idx] = input_data[token_row * dim_embed + col];
    }
}
|
||||
|
||||
// Append-mode variant of RebuildPaddingCPUImpl: emits one row per surviving
// token instead of one per batch entry. output_padding_offset maps each
// compacted output row back to its position in the padded token layout.
template <typename T>
void RebuildAppendPaddingCPUImpl(T *output_data,
                                 const T *input_data,
                                 const int *cum_offsets_data,
                                 const int *seq_len_this_time_data,
                                 const int *seq_lens_decoder_data,
                                 const int *seq_lens_encoder_data,
                                 const int *output_padding_offset_data,
                                 const int max_input_length,
                                 const int dim_embed,
                                 const int64_t output_elem_nums) {
    for (int idx = 0; idx < output_elem_nums; ++idx) {
        const int out_row = idx / dim_embed;
        const int col = idx % dim_embed;

        // Recover the padded position of this output row and its batch.
        const int padded_row = out_row + output_padding_offset_data[out_row];
        const int batch = padded_row / max_input_length;

        // Sequences with nothing to emit this step are skipped.
        const bool idle = seq_len_this_time_data[batch] == 0 ||
                          (seq_lens_decoder_data[batch] == 0 &&
                           seq_lens_encoder_data[batch] == 0);
        if (idle) {
            continue;
        }

        // Prefill reads the last prompt token; decode reads position 0.
        int pos = 0;
        if (seq_lens_encoder_data[batch] > 0) {
            pos = seq_lens_encoder_data[batch] - 1;
        }
        const int src_row = padded_row - cum_offsets_data[batch] + pos;
        output_data[idx] = input_data[src_row * dim_embed + col];
    }
}
|
||||
|
||||
// CPU kernel of rebuild_padding: compacts the padded/batched hidden states
// in `tmp_out` ([token_num, dim_embed]) down to one row per live position.
//  - Without output_padding_offset: one row per batch entry
//    ([bsz, dim_embed]) — the last prompt token in prefill, the current
//    token in decode (see RebuildPaddingCPUImpl).
//  - With output_padding_offset ("append" mode): one row per surviving
//    token ([token_num - deleted, dim_embed], see RebuildAppendPaddingCPUImpl).
// All inputs are first copied to CPU (blocking), so the op also accepts
// tensors living on other devices. Supported dtypes: fp32 / fp16 / bf16.
std::vector<paddle::Tensor> RebuildPaddingCPU(
    const paddle::Tensor &tmp_out,
    const paddle::Tensor &cum_offsets,
    const paddle::Tensor &seq_len_this_time,
    const paddle::Tensor &seq_lens_decoder,
    const paddle::Tensor &seq_lens_encoder,
    const paddle::optional<paddle::Tensor> &output_padding_offset,
    int max_input_length) {
    // Blocking host copies so the raw data<T>() pointers below are CPU memory.
    auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
    auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
    auto seq_len_this_time_cpu =
        seq_len_this_time.copy_to(paddle::CPUPlace(), true);
    auto seq_lens_decoder_cpu =
        seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
    auto seq_lens_encoder_cpu =
        seq_lens_encoder.copy_to(paddle::CPUPlace(), true);
    paddle::optional<paddle::Tensor> output_padding_offset_cpu;
    if (output_padding_offset) {
        output_padding_offset_cpu =
            output_padding_offset->copy_to(paddle::CPUPlace(), true);
    }

    int token_num = tmp_out_cpu.shape()[0];
    int dim_embed = tmp_out_cpu.shape()[1];
    int bsz = cum_offsets_cpu.shape()[0];

    paddle::Tensor out;
    if (output_padding_offset_cpu) {
        // Append mode: every prefill sequence keeps only its last token, so
        // the output drops (encoder_len - 1) rows per prefill sequence.
        int need_delete_token_num = 0;
        for (int i = 0; i < bsz; ++i) {
            if (seq_lens_encoder_cpu.data<int>()[i] > 0) {
                need_delete_token_num +=
                    seq_lens_encoder_cpu.data<int>()[i] - 1;
            }
        }
        int output_token_num = token_num - need_delete_token_num;
        // Zero-filled so skipped (idle) rows read as zeros.
        out = paddle::full({output_token_num, dim_embed},
                           0,
                           tmp_out_cpu.dtype(),
                           paddle::CPUPlace());
    } else {
        // Batch mode: exactly one row per batch entry.
        out = paddle::full(
            {bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
    }

    const int *cum_offsets_data = cum_offsets_cpu.data<int>();
    const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
    const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
    const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
    int elem_nums = out.numel();

    // Dispatch on activation dtype; each branch is the same call with a
    // different template instantiation.
    if (output_padding_offset_cpu) {
        const int *output_padding_offset_data =
            output_padding_offset_cpu->data<int>();
        switch (tmp_out_cpu.dtype()) {
            case paddle::DataType::FLOAT32:
                RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
                                                   tmp_out_cpu.data<float>(),
                                                   cum_offsets_data,
                                                   seq_len_this_time_data,
                                                   seq_lens_decoder_data,
                                                   seq_lens_encoder_data,
                                                   output_padding_offset_data,
                                                   max_input_length,
                                                   dim_embed,
                                                   elem_nums);
                break;
            case paddle::DataType::FLOAT16:
                RebuildAppendPaddingCPUImpl<paddle::float16>(
                    out.data<paddle::float16>(),
                    tmp_out_cpu.data<paddle::float16>(),
                    cum_offsets_data,
                    seq_len_this_time_data,
                    seq_lens_decoder_data,
                    seq_lens_encoder_data,
                    output_padding_offset_data,
                    max_input_length,
                    dim_embed,
                    elem_nums);
                break;
            case paddle::DataType::BFLOAT16:
                RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
                    out.data<paddle::bfloat16>(),
                    tmp_out_cpu.data<paddle::bfloat16>(),
                    cum_offsets_data,
                    seq_len_this_time_data,
                    seq_lens_decoder_data,
                    seq_lens_encoder_data,
                    output_padding_offset_data,
                    max_input_length,
                    dim_embed,
                    elem_nums);
                break;
            default:
                PD_THROW(
                    "Unsupported data type for rebuild_padding_cpu. "
                    "Only float32, float16, and bfloat16 are supported.");
        }
    } else {
        switch (tmp_out_cpu.dtype()) {
            case paddle::DataType::FLOAT32:
                RebuildPaddingCPUImpl<float>(out.data<float>(),
                                             tmp_out_cpu.data<float>(),
                                             cum_offsets_data,
                                             seq_len_this_time_data,
                                             seq_lens_decoder_data,
                                             seq_lens_encoder_data,
                                             max_input_length,
                                             dim_embed,
                                             elem_nums);
                break;
            case paddle::DataType::FLOAT16:
                RebuildPaddingCPUImpl<paddle::float16>(
                    out.data<paddle::float16>(),
                    tmp_out_cpu.data<paddle::float16>(),
                    cum_offsets_data,
                    seq_len_this_time_data,
                    seq_lens_decoder_data,
                    seq_lens_encoder_data,
                    max_input_length,
                    dim_embed,
                    elem_nums);
                break;
            case paddle::DataType::BFLOAT16:
                RebuildPaddingCPUImpl<paddle::bfloat16>(
                    out.data<paddle::bfloat16>(),
                    tmp_out_cpu.data<paddle::bfloat16>(),
                    cum_offsets_data,
                    seq_len_this_time_data,
                    seq_lens_decoder_data,
                    seq_lens_encoder_data,
                    max_input_length,
                    dim_embed,
                    elem_nums);
                break;
            default:
                PD_THROW(
                    "Unsupported data type for rebuild_padding_cpu. "
                    "Only float32, float16, and bfloat16 are supported.");
        }
    }
    return {out};
}
|
||||
|
||||
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
|
||||
const std::vector<int64_t> &tmp_out_shape,
|
||||
const std::vector<int64_t> &cum_offsets_shape,
|
||||
const std::vector<int64_t> &seq_len_this_time_shape,
|
||||
const std::vector<int64_t> &seq_lens_decoder_shape,
|
||||
const std::vector<int64_t> &seq_lens_encoder_shape,
|
||||
const paddle::optional<std::vector<int64_t>> &output_padding_offset_shape) {
|
||||
int64_t dim_embed = tmp_out_shape[1];
|
||||
if (output_padding_offset_shape) {
|
||||
return {{-1, dim_embed}};
|
||||
} else {
|
||||
int64_t bsz = cum_offsets_shape[0];
|
||||
return {{bsz, dim_embed}};
|
||||
}
|
||||
}
|
||||
|
||||
// Dtype inference for rebuild_padding_cpu: the output keeps the activation
// dtype of tmp_out; the length/offset tensors do not affect it.
std::vector<paddle::DataType> RebuildPaddingInferDtype(
    const paddle::DataType &tmp_out_dtype,
    const paddle::DataType &cum_offsets_dtype,
    const paddle::DataType &seq_len_this_time_dtype,
    const paddle::DataType &seq_lens_decoder_dtype,
    const paddle::DataType &seq_lens_encoder_dtype,
    const paddle::optional<paddle::DataType> &output_padding_offset_dtype) {
    return {tmp_out_dtype};
}
|
||||
|
||||
// Registers rebuild_padding_cpu: gathers one hidden-state row per live
// sequence (or per surviving token when output_padding_offset is given)
// from the padded batch layout.
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
    .Inputs({"tmp_out",
             "cum_offsets",
             "seq_len_this_time",
             "seq_lens_decoder",
             "seq_lens_encoder",
             paddle::Optional("output_padding_offset")})
    .Outputs({"out"})
    .Attrs({"max_input_length: int"})
    .SetKernelFn(PD_KERNEL(RebuildPaddingCPU))
    .SetInferShapeFn(PD_INFER_SHAPE(RebuildPaddingInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(RebuildPaddingInferDtype));
|
||||
@@ -1,201 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "layers_decoder.h"
|
||||
#include "paddle/extension.h"
|
||||
#include "paddle/phi/core/kernel_registry.h"
|
||||
|
||||
// Runs the full stack of LLaMA decoder layers for one forward step through
// xFastTransformer's fused invokeLayerLLaMA kernel, one layer at a time.
// Per-layer weights/biases arrive as parallel vectors indexed by layer id.
// pastSeqLen / currentSeqLen / step are 1-element tensors read on the host.
// Returns a tensor shaped like `input` containing the last layer's output.
std::vector<paddle::Tensor> InvokeAllLLaMALayer(
    const paddle::Tensor &input,
    const std::vector<paddle::Tensor> &ln1Gamma,
    const std::vector<paddle::Tensor> &ln1Beta,
    const std::vector<paddle::Tensor> &qkvWeight,
    const std::vector<paddle::Tensor> &qkvBiasWeight,
    const std::vector<paddle::Tensor> &attnOutWeight,
    const std::vector<paddle::Tensor> &attnOutBias,
    const std::vector<paddle::Tensor> &ln2Gamma,
    const std::vector<paddle::Tensor> &ln2Beta,
    const std::vector<paddle::Tensor> &gateWeight,
    const std::vector<paddle::Tensor> &gateBias,
    const std::vector<paddle::Tensor> &upWeight,
    const std::vector<paddle::Tensor> &upBias,
    const std::vector<paddle::Tensor> &downWeight,
    const std::vector<paddle::Tensor> &downBias,
    const paddle::Tensor &pastSeqLen,
    const paddle::Tensor &currentSeqLen,
    const paddle::Tensor &step,
    int hiddensize,
    int totalLayer,
    const std::string &computeType,
    const std::string &activation,
    const std::string &normType,
    int attHeadDim,
    int attHeadNum,
    int kvHeadNum,
    int maxPositions,
    int maxPosEmbed,
    int intermediateSize) {
    auto out = paddle::empty_like(input);
    // Assumes input is [batch, seq, hidden] — TODO confirm with callers.
    auto batchSize = input.shape()[0];
    auto inputSeqLen = input.shape()[1];
    // Scalar control tensors, read on host. Note the dtype asymmetry:
    // pastSeqLen/step are int64 while currentSeqLen is int32.
    auto past_seq_len = pastSeqLen.data<int64_t>()[0];
    auto cur_seq_len = static_cast<int64_t>(currentSeqLen.data<int32_t>()[0]);
    auto step_id = step.data<int64_t>()[0];
    auto output_ptr = reinterpret_cast<void *>(out.data<float>());
    // Compute precision: fp16 by default, overridable to bf16 / bf16_int8.
    auto xft_data_type = xft::DataType::fp16;
    if (computeType == "bf16") {
        xft_data_type = xft::DataType::bf16;
    } else if (computeType == "bf16_int8") {
        xft_data_type = xft::DataType::bf16_int8;
    }
    // MLP activation: SILU by default.
    auto xft_act_type = xft::ActivationType::SILU;
    if (activation == "relu") {
        xft_act_type = xft::ActivationType::RELU;
    } else if (activation == "gelu") {
        xft_act_type = xft::ActivationType::GELU;
    } else if (activation == "swiglu") {
        xft_act_type = xft::ActivationType::SWIGLU;
    }
    // Normalization: RMSNorm by default, plain LayerNorm on request.
    auto xft_norm_type = xft::NormType::RMS;
    if (normType == "layernorm") {
        xft_norm_type = xft::NormType::LN;
    }
    auto input_ptr = reinterpret_cast<const void *>(input.data<float>());
    for (int i = 0; i < totalLayer; ++i) {
        auto ln1Gamma_ptr =
            reinterpret_cast<const float *>(ln1Gamma[i].data<float>());
        auto ln1Beta_ptr =
            reinterpret_cast<const float *>(ln1Beta[i].data<float>());
        auto qkvWeight_ptr =
            reinterpret_cast<const void *>(qkvWeight[i].data<float>());
        auto qkvBiasWeight_ptr =
            reinterpret_cast<const float *>(qkvBiasWeight[i].data<float>());
        auto attnOutWeight_ptr =
            reinterpret_cast<const void *>(attnOutWeight[i].data<float>());
        auto ln2Gamma_ptr =
            reinterpret_cast<const float *>(ln2Gamma[i].data<float>());
        auto ln2Beta_ptr =
            reinterpret_cast<const float *>(ln2Beta[i].data<float>());
        auto gate_weight_ptr =
            reinterpret_cast<const void *>(gateWeight[i].data<float>());
        auto up_weight_ptr =
            reinterpret_cast<const void *>(upWeight[i].data<float>());
        auto down_weight_ptr =
            reinterpret_cast<const void *>(downWeight[i].data<float>());
        auto gate_bias_ptr =
            reinterpret_cast<const float *>(gateBias[i].data<float>());
        auto up_bias_ptr =
            reinterpret_cast<const float *>(upBias[i].data<float>());
        auto down_bias_ptr =
            reinterpret_cast<const float *>(downBias[i].data<float>());
        auto attnOutBias_ptr =
            reinterpret_cast<const float *>(attnOutBias[i].data<float>());
        // NOTE(review): qkvWeight_ptr is `const void *`, so the
        // `+ hiddensize` K/V offsets below rely on the GNU extension that
        // treats void* arithmetic as byte arithmetic — i.e. the offsets are
        // in BYTES, not elements. Verify this matches the fused QKV layout
        // that invokeLayerLLaMA expects.
        invokeLayerLLaMA(
            xft_data_type,      // dt
            xft_act_type,       // at
            xft_norm_type,      // nt
            i,                  // layerId
            totalLayer,         // totalLayers
            batchSize,          // batchSize
            inputSeqLen,        // inputSeqLen
            attHeadDim,         // attHeadDim
            attHeadNum,         // attHeadNum
            kvHeadNum,          // kvHeadNum
            maxPositions,       // maxPositions
            maxPosEmbed,        // maxPosEmbed
            past_seq_len,       // pastSeqLen
            cur_seq_len,        // currentSeqLen
            step_id,            // step
            hiddensize,         // hiddenSize
            intermediateSize,   // intermediateSize
            reinterpret_cast<void *>(output_ptr),  // output
            hiddensize,         // outputStride
            input_ptr,          // input
            hiddensize,         // inputStride
            ln1Gamma_ptr,       // ln1Gamma
            ln1Beta_ptr,        // ln1Beta
            qkvWeight_ptr,      // queryWeight
            qkvWeight_ptr + hiddensize,  // keyWeight
            qkvWeight_ptr + hiddensize + kvHeadNum * attHeadDim,  // valueWeight
            attnOutWeight_ptr,  // attnOutWeight
            ln2Gamma_ptr,       // ln2Gamma
            ln2Beta_ptr,        // ln2Beta
            gate_weight_ptr,
            up_weight_ptr,
            down_weight_ptr,
            qkvBiasWeight_ptr,  // queryBias
            qkvBiasWeight_ptr + hiddensize,  // keyBias
            qkvBiasWeight_ptr + hiddensize +
                kvHeadNum * attHeadDim,  // valueBias
            attnOutBias_ptr,    // attnOutBias
            qkvWeight_ptr,      // myqkvWeight
            gate_bias_ptr,
            up_bias_ptr,
            down_bias_ptr,
            qkvBiasWeight_ptr);
        // Feed this layer's output to the next layer by overwriting the
        // (const-cast) input buffer — the op intentionally clobbers `input`
        // between layers; only the final layer's result stays in `out`.
        if (i < totalLayer - 1) {
            memcpy(const_cast<void *>(input_ptr),
                   output_ptr,
                   batchSize * inputSeqLen * hiddensize * sizeof(float));
        }
    }
    return {out};
}
|
||||
|
||||
// Shape inference for xft_llama_all_layer: the stacked decoder layers
// transform activations in place, so the output shape equals the input's.
std::vector<std::vector<int64_t>> AllLLaMALayerInferShape(
    std::vector<int64_t> x_shape) {
    std::vector<std::vector<int64_t>> shapes;
    shapes.push_back(std::move(x_shape));
    return shapes;
}
|
||||
|
||||
// Dtype inference for xft_llama_all_layer: output keeps the input dtype.
std::vector<paddle::DataType> AllLLaMALayerInferDtype(
    paddle::DataType x_dtype) {
    return {x_dtype};
}
|
||||
|
||||
// Registers xft_llama_all_layer: a single fused op covering every LLaMA
// decoder layer of one forward step. Per-layer weight/bias tensors are
// passed as variadic inputs (paddle::Vec), one entry per layer.
PD_BUILD_STATIC_OP(xft_llama_all_layer)
    .Inputs({
        "x",
        paddle::Vec("ln1Gamma"),
        paddle::Vec("ln1Beta"),
        paddle::Vec("qkvWeight"),
        paddle::Vec("qkvBiasWeight"),
        paddle::Vec("attnOutWeight"),
        paddle::Vec("attnOutBias"),
        paddle::Vec("ln2Gamma"),
        paddle::Vec("ln2Beta"),
        paddle::Vec("gateWeight"),
        paddle::Vec("gateBias"),
        paddle::Vec("upWeight"),
        paddle::Vec("upBias"),
        paddle::Vec("downWeight"),
        paddle::Vec("downBias"),
        "pastSeqLen",
        "currentSeqLen",
        "step",
    })
    .Outputs({"out"})
    .Attrs({"hiddensize :int",
            "totalLayer :int",
            "computeType : std::string",
            "activation :std::string",
            "normType :std::string",
            "attHeadDim: int",
            "attHeadNum: int",
            "kvHeadNum: int",
            "maxPositions: int",
            "maxPosEmbed: int",
            "intermediateSize: int"})
    .SetKernelFn(PD_KERNEL(InvokeAllLLaMALayer))
    .SetInferShapeFn(PD_INFER_SHAPE(AllLLaMALayerInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(AllLLaMALayerInferDtype));
|
||||
@@ -1,126 +0,0 @@
|
||||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include <omp.h>

#include <cstdio>
#include <iostream>
#include <vector>

#include "paddle/extension.h"
|
||||
|
||||
void greedy_search(const float *probs,
|
||||
int64_t *next_token_ids,
|
||||
int bsz,
|
||||
int vocab_size) {
|
||||
int numThreads = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
int tid = omp_get_thread_num();
|
||||
if (tid == 0) {
|
||||
numThreads = omp_get_num_threads();
|
||||
}
|
||||
}
|
||||
float maxVals[bsz];
|
||||
|
||||
// Small batch size (each sample can have at least 2 threads)
|
||||
if (numThreads / bsz >= 2) {
|
||||
int thrPerSample = numThreads / bsz;
|
||||
int sizePerThr = (vocab_size + thrPerSample - 1) / thrPerSample;
|
||||
int maxIndices[bsz * thrPerSample];
|
||||
float maxValues[bsz * thrPerSample];
|
||||
|
||||
// TODO: if size is small, possible to cause out of boundary
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int b = 0; b < bsz; ++b) {
|
||||
for (int t = 0; t < thrPerSample; ++t) {
|
||||
int start = t * sizePerThr;
|
||||
int end = (start + sizePerThr) > vocab_size
|
||||
? vocab_size
|
||||
: (start + sizePerThr);
|
||||
const float *p = probs + b * vocab_size;
|
||||
int maxIdx = start;
|
||||
float maxVal = p[start];
|
||||
for (int off = start + 1; off < end; ++off) {
|
||||
if (p[off] > maxVal) {
|
||||
maxVal = p[off];
|
||||
maxIdx = off;
|
||||
}
|
||||
}
|
||||
|
||||
// False sharing happens, but since only one time, not avoided
|
||||
maxIndices[b * thrPerSample + t] = maxIdx;
|
||||
maxValues[b * thrPerSample + t] = maxVal;
|
||||
}
|
||||
}
|
||||
|
||||
// Local reduction
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
int *pIndices = maxIndices + i * thrPerSample;
|
||||
float *pValues = maxValues + i * thrPerSample;
|
||||
int maxIdx = pIndices[0];
|
||||
float maxVal = pValues[0];
|
||||
for (int j = 1; j < thrPerSample; ++j) {
|
||||
if (pValues[j] > maxVal) {
|
||||
maxVal = pValues[j];
|
||||
maxIdx = pIndices[j];
|
||||
}
|
||||
}
|
||||
next_token_ids[i] = maxIdx;
|
||||
maxVals[i] = maxVal;
|
||||
}
|
||||
}
|
||||
|
||||
// Each thread handle one sample (one row)
|
||||
else {
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < bsz; ++i) {
|
||||
int maxId = 0;
|
||||
const float *p = probs + i * vocab_size;
|
||||
float maxVal = p[0];
|
||||
for (int j = 1; j < vocab_size; ++j) {
|
||||
if (p[j] > maxVal) {
|
||||
maxVal = p[j];
|
||||
maxId = j;
|
||||
}
|
||||
}
|
||||
next_token_ids[i] = maxId;
|
||||
maxVals[i] = maxVal;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Greedy (argmax) token selection over a [bsz, vocab_size] float32
// probability tensor; returns int64 token ids shaped [bsz, 1], allocated
// on the same place as `probs`.
std::vector<paddle::Tensor> XftGreedySearch(const paddle::Tensor &probs) {
    const int bsz = probs.shape()[0];
    const int vocab_size = probs.shape()[1];
    auto next_tokens =
        paddle::empty({bsz, 1}, paddle::DataType::INT64, probs.place());

    // const_cast yields the writable output pointer the kernel fills.
    greedy_search(probs.data<float>(),
                  const_cast<int64_t *>(next_tokens.data<int64_t>()),
                  bsz,
                  vocab_size);
    return {next_tokens};
}
|
||||
// Shape inference for xft_greedy_search: one argmax id per batch row,
// i.e. [bsz, 1] regardless of vocabulary size.
std::vector<std::vector<int64_t>> XftGreedySearchInferShape(
    const std::vector<int64_t> &probs_shape) {
    return {{probs_shape[0], 1}};
}
|
||||
// Dtype inference for xft_greedy_search: token ids are always int64,
// independent of the probs dtype.
std::vector<paddle::DataType> XftGreedySearchInferDtype(
    const paddle::DataType &probs_dtype) {
    return {paddle::DataType::INT64};
}
|
||||
// Registers xft_greedy_search: argmax sampling over [bsz, vocab] float
// probabilities, producing int64 token ids shaped [bsz, 1].
PD_BUILD_STATIC_OP(xft_greedy_search)
    .Inputs({"probs"})
    .Outputs({"next_tokens_ids"})
    .SetInferShapeFn(PD_INFER_SHAPE(XftGreedySearchInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(XftGreedySearchInferDtype))
    .SetKernelFn(PD_KERNEL(XftGreedySearch));
|
||||
Reference in New Issue
Block a user