diff --git a/CMakeLists.txt b/CMakeLists.txt index 387a56f87..0b24ea2e7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -415,14 +415,14 @@ endif() if(ANDROID OR IOS) if(ENABLE_TEXT) set(ENABLE_TEXT OFF CACHE BOOL "Force ENABLE_TEXT OFF" FORCE) - message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support faster_tokenizer with Android/IOS now.") + message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support fast_tokenizer with Android/IOS now.") endif() endif() if(ENABLE_TEXT) add_definitions(-DENABLE_TEXT) list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TEXT_SRCS}) - include(${PROJECT_SOURCE_DIR}/cmake/faster_tokenizer.cmake) + include(${PROJECT_SOURCE_DIR}/cmake/fast_tokenizer.cmake) endif() if(ENABLE_PADDLE_FRONTEND) diff --git a/FastDeploy.cmake.in b/FastDeploy.cmake.in index 92439af61..5e2eacf3a 100755 --- a/FastDeploy.cmake.in +++ b/FastDeploy.cmake.in @@ -213,10 +213,10 @@ if (ENABLE_TEXT) message(FATAL_ERROR "Not support fastdeploy text APIs with Android now!") endif() # Add dependency libs later - find_library(FASTER_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/lib NO_DEFAULT_PATH) - list(APPEND FASTDEPLOY_LIBS ${FASTER_TOKENIZER_LIB}) - list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/include) - list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/third_party/include) + find_library(FAST_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib NO_DEFAULT_PATH) + list(APPEND FASTDEPLOY_LIBS ${FAST_TOKENIZER_LIB}) + list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/include) + list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/third_party/include) endif() if(ENABLE_PADDLE_FRONTEND) diff --git a/cmake/fast_tokenizer.cmake b/cmake/fast_tokenizer.cmake new file mode 100644 index 000000000..9550d28b8 --- /dev/null +++ b/cmake/fast_tokenizer.cmake @@ -0,0 +1,108 @@ + + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +include(ExternalProject) + +set(FASTTOKENIZER_PROJECT "extern_fast_tokenizer") +set(FASTTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/fast_tokenizer) +set(FASTTOKENIZER_SOURCE_DIR + ${THIRD_PARTY_PATH}/fast_tokenizer/src/${FASTTOKENIZER_PROJECT}) +set(FASTTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/fast_tokenizer) +set(FASTTOKENIZER_INC_DIR + "${FASTTOKENIZER_INSTALL_DIR}/include" + "${FASTTOKENIZER_INSTALL_DIR}/third_party/include" + CACHE PATH "fast_tokenizer include directory." FORCE) +set(FASTTOKENIZER_LIB_DIR + "${FASTTOKENIZER_INSTALL_DIR}/lib/" + CACHE PATH "fast_tokenizer lib directory." FORCE) +set(FASTTOKENIZER_THIRD_LIB_DIR + "${FASTTOKENIZER_INSTALL_DIR}/third_party/lib/" + CACHE PATH "fast_tokenizer lib directory." FORCE) +set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" + "${FASTTOKENIZER_LIB_DIR}") + +include_directories(${FASTTOKENIZER_INC_DIR}) + +# Set lib path +if(WIN32) +set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/core_tokenizers.lib" + CACHE FILEPATH "fast_tokenizer compile library." FORCE) +message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}") +set(ICUDT_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icudt.lib") +set(ICUUC_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icuuc.lib") + +elseif(APPLE) +set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib" + CACHE FILEPATH "fast_tokenizer compile library." FORCE) +else() + +set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so" + CACHE FILEPATH "fast_tokenizer compile library." FORCE) +message("FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}") +endif(WIN32) + +set(FASTTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/fast_tokenizer/") +set(FASTTOKENIZER_VERSION "1.0.0") + +# Set download url +if(WIN32) + set(FASTTOKENIZER_FILE "fast_tokenizer-win-x64-${FASTTOKENIZER_VERSION}.zip") + if(NOT CMAKE_CL_64) + set(FASTTOKENIZER_FILE "fast_tokenizer-win-x86-${FASTTOKENIZER_VERSION}.zip") + endif() +elseif(APPLE) + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64") + set(FASTTOKENIZER_FILE "fast_tokenizer-osx-arm64-${FASTTOKENIZER_VERSION}.tgz") + else() + set(FASTTOKENIZER_FILE "fast_tokenizer-osx-x86_64-${FASTTOKENIZER_VERSION}.tgz") + endif() +else() + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(FASTTOKENIZER_FILE "fast_tokenizer-linux-aarch64-${FASTTOKENIZER_VERSION}.tgz") + else() + set(FASTTOKENIZER_FILE "fast_tokenizer-linux-x64-${FASTTOKENIZER_VERSION}.tgz") + endif() +endif() +set(FASTTOKENIZER_URL "${FASTTOKENIZER_URL_BASE}${FASTTOKENIZER_FILE}") + +ExternalProject_Add( + ${FASTTOKENIZER_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${FASTTOKENIZER_URL} + PREFIX ${FASTTOKENIZER_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR} ${FASTTOKENIZER_INSTALL_DIR} + BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB}) + +add_library(fast_tokenizer STATIC IMPORTED GLOBAL) +set_property(TARGET fast_tokenizer PROPERTY IMPORTED_LOCATION ${FASTTOKENIZER_COMPILE_LIB}) +add_dependencies(fast_tokenizer ${FASTTOKENIZER_PROJECT}) +list(APPEND DEPEND_LIBS fast_tokenizer) + +if (WIN32) + add_library(icudt STATIC IMPORTED GLOBAL) + set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB}) + add_dependencies(icudt ${FASTTOKENIZER_PROJECT}) + list(APPEND DEPEND_LIBS icudt) + + add_library(icuuc STATIC IMPORTED GLOBAL) + set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB}) + add_dependencies(icuuc ${FASTTOKENIZER_PROJECT}) + list(APPEND DEPEND_LIBS icuuc) +endif() diff --git a/cmake/faster_tokenizer.cmake b/cmake/faster_tokenizer.cmake deleted file mode 100644 index 730aeff66..000000000 --- a/cmake/faster_tokenizer.cmake +++ /dev/null @@ -1,108 +0,0 @@ - - -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -include(ExternalProject) - -set(FASTERTOKENIZER_PROJECT "extern_faster_tokenizer") -set(FASTERTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/faster_tokenizer) -set(FASTERTOKENIZER_SOURCE_DIR - ${THIRD_PARTY_PATH}/faster_tokenizer/src/${FASTERTOKENIZER_PROJECT}) -set(FASTERTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/faster_tokenizer) -set(FASTERTOKENIZER_INC_DIR - "${FASTERTOKENIZER_INSTALL_DIR}/include" - "${FASTERTOKENIZER_INSTALL_DIR}/third_party/include" - CACHE PATH "faster_tokenizer include directory." FORCE) -set(FASTERTOKENIZER_LIB_DIR - "${FASTERTOKENIZER_INSTALL_DIR}/lib/" - CACHE PATH "faster_tokenizer lib directory." FORCE) -set(FASTERTOKENIZER_THIRD_LIB_DIR - "${FASTERTOKENIZER_INSTALL_DIR}/third_party/lib/" - CACHE PATH "faster_tokenizer lib directory." FORCE) -set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" - "${FASTERTOKENIZER_LIB_DIR}") - -include_directories(${FASTERTOKENIZER_INC_DIR}) - -# Set lib path -if(WIN32) -set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/core_tokenizers.lib" - CACHE FILEPATH "faster_tokenizer compile library." FORCE) -message("FASTERTOKENIZER_COMPILE_LIB = ${FASTERTOKENIZER_COMPILE_LIB}") -set(ICUDT_LIB "${FASTERTOKENIZER_THIRD_LIB_DIR}/icudt.lib") -set(ICUUC_LIB "${FASTERTOKENIZER_THIRD_LIB_DIR}/icuuc.lib") - -elseif(APPLE) -set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib" - CACHE FILEPATH "faster_tokenizer compile library." FORCE) -else() - -set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/libcore_tokenizers.so" - CACHE FILEPATH "faster_tokenizer compile library." FORCE) -message("FASTERTOKENIZER_COMPILE_LIB = ${FASTERTOKENIZER_COMPILE_LIB}") -endif(WIN32) - -set(FASTERTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/faster_tokenizer/") -set(FASTERTOKENIZER_VERSION "dev") - -# Set download url -if(WIN32) - set(FASTERTOKENIZER_FILE "faster_tokenizer-win-x64-${FASTERTOKENIZER_VERSION}.zip") - if(NOT CMAKE_CL_64) - set(FASTERTOKENIZER_FILE "faster_tokenizer-win-x86-${FASTERTOKENIZER_VERSION}.zip") - endif() -elseif(APPLE) - if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64") - set(FASTERTOKENIZER_FILE "faster_tokenizer-osx-arm64-${FASTERTOKENIZER_VERSION}.tgz") - else() - set(FASTERTOKENIZER_FILE "faster_tokenizer-osx-x86_64-${FASTERTOKENIZER_VERSION}.tgz") - endif() -else() - if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - set(FASTERTOKENIZER_FILE "faster_tokenizer-linux-aarch64-${FASTERTOKENIZER_VERSION}.tgz") - else() - set(FASTERTOKENIZER_FILE "faster_tokenizer-linux-x64-${FASTERTOKENIZER_VERSION}.tgz") - endif() -endif() -set(FASTERTOKENIZER_URL "${FASTERTOKENIZER_URL_BASE}${FASTERTOKENIZER_FILE}") - -ExternalProject_Add( - ${FASTERTOKENIZER_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${FASTERTOKENIZER_URL} - PREFIX ${FASTERTOKENIZER_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND - ${CMAKE_COMMAND} -E copy_directory ${FASTERTOKENIZER_SOURCE_DIR} ${FASTERTOKENIZER_INSTALL_DIR} - BUILD_BYPRODUCTS ${FASTERTOKENIZER_COMPILE_LIB}) - -add_library(faster_tokenizer STATIC IMPORTED GLOBAL) -set_property(TARGET faster_tokenizer PROPERTY IMPORTED_LOCATION ${FASTERTOKENIZER_COMPILE_LIB}) -add_dependencies(faster_tokenizer ${FASTERTOKENIZER_PROJECT}) -list(APPEND DEPEND_LIBS faster_tokenizer) - -if (WIN32) - add_library(icudt STATIC IMPORTED GLOBAL) - set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB}) - add_dependencies(icudt ${FASTERTOKENIZER_PROJECT}) - list(APPEND DEPEND_LIBS icudt) - - add_library(icuuc STATIC IMPORTED GLOBAL) - set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB}) - add_dependencies(icuuc ${FASTERTOKENIZER_PROJECT}) - list(APPEND DEPEND_LIBS icuuc) -endif() \ No newline at end of file diff --git a/examples/text/ernie-3.0/cpp/seq_cls_infer.cc b/examples/text/ernie-3.0/cpp/seq_cls_infer.cc index b7ab495bd..c0f58da8d 100644 --- a/examples/text/ernie-3.0/cpp/seq_cls_infer.cc +++ b/examples/text/ernie-3.0/cpp/seq_cls_infer.cc @@ -18,11 +18,11 @@ #include "fastdeploy/function/softmax.h" #include "fastdeploy/runtime.h" #include "fastdeploy/utils/path.h" -#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h" +#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h" #include "gflags/gflags.h" using namespace paddlenlp; -using namespace faster_tokenizer::tokenizers_impl; +using namespace fast_tokenizer::tokenizers_impl; #ifdef WIN32 const char sep = '\\'; #else @@ -124,10 +124,10 @@ struct SeqClsResult { struct ErnieForSequenceClassificationPredictor { fastdeploy::Runtime runtime_; - ErnieFasterTokenizer tokenizer_; + ErnieFastTokenizer tokenizer_; ErnieForSequenceClassificationPredictor( const fastdeploy::RuntimeOption& option, - const ErnieFasterTokenizer& tokenizer) + const ErnieFastTokenizer& tokenizer) : tokenizer_(tokenizer) { runtime_.Init(option); } @@ -135,8 +135,8 @@ struct ErnieForSequenceClassificationPredictor { bool Preprocess(const std::vector& texts, const std::vector& texts_pair, std::vector* inputs) { - std::vector encodings; - std::vector text_pair_input; + std::vector encodings; + std::vector text_pair_input; // 1. Tokenize the text or (text, text_pair) if (texts_pair.empty()) { for (int i = 0; i < texts.size(); ++i) { @@ -242,7 +242,7 @@ int main(int argc, char* argv[]) { return -1; } } - ErnieFasterTokenizer tokenizer(vocab_path); + ErnieFastTokenizer tokenizer(vocab_path); ErnieForSequenceClassificationPredictor predictor(option, tokenizer); diff --git a/examples/text/ernie-3.0/python/requirements.txt b/examples/text/ernie-3.0/python/requirements.txt index 29711008e..27f236104 100755 --- a/examples/text/ernie-3.0/python/requirements.txt +++ b/examples/text/ernie-3.0/python/requirements.txt @@ -1,2 +1,2 @@ -faster_tokenizer +fast-tokenizer-python paddlenlp diff --git a/examples/text/ernie-3.0/python/seq_cls_infer.py b/examples/text/ernie-3.0/python/seq_cls_infer.py index de67884a1..eb99d1d84 100644 --- a/examples/text/ernie-3.0/python/seq_cls_infer.py +++ b/examples/text/ernie-3.0/python/seq_cls_infer.py @@ -15,7 +15,7 @@ import os import distutils.util import numpy as np -import faster_tokenizer +import fast_tokenizer from paddlenlp.transformers import AutoTokenizer import fastdeploy as fd diff --git a/fastdeploy/text/uie/model.cc b/fastdeploy/text/uie/model.cc index c04d4d6d3..490a4a0fb 100644 --- a/fastdeploy/text/uie/model.cc +++ b/fastdeploy/text/uie/model.cc @@ -19,8 +19,8 @@ #include #include -#include "faster_tokenizer/pretokenizers/pretokenizer.h" -#include "faster_tokenizer/utils/utf8.h" +#include "fast_tokenizer/pretokenizers/pretokenizer.h" +#include "fast_tokenizer/utils/utf8.h" namespace fastdeploy { namespace text { @@ -30,9 +30,9 @@ static std::string DBC2SBC(const std::string& content) { size_t content_utf8_len = 0; while (content_utf8_len < content.length()) { uint32_t content_char; - auto content_char_width = faster_tokenizer::utils::UTF8ToUInt32( + auto content_char_width = fast_tokenizer::utils::UTF8ToUInt32( content.data() + content_utf8_len, &content_char); - content_char = faster_tokenizer::utils::UTF8ToUnicode(content_char); + content_char = fast_tokenizer::utils::UTF8ToUnicode(content_char); if (content_char == 0x3000) { content_char = 0x0020; } else { @@ -43,9 +43,9 @@ static std::string DBC2SBC(const std::string& content) { } else { char dst_char[5] = {0}; uint32_t utf8_uint32 = - faster_tokenizer::utils::UnicodeToUTF8(content_char); + fast_tokenizer::utils::UnicodeToUTF8(content_char); uint32_t utf8_char_count = - faster_tokenizer::utils::UnicodeToUTF8Char(utf8_uint32, dst_char); + fast_tokenizer::utils::UnicodeToUTF8Char(utf8_uint32, dst_char); result.append(dst_char, utf8_char_count); } content_utf8_len += content_char_width; @@ -177,8 +177,8 @@ UIEModel::UIEModel(const std::string& model_file, initialized = Initialize(); SetSchema(schema); tokenizer_.EnableTruncMethod( - max_length, 0, faster_tokenizer::core::Direction::RIGHT, - faster_tokenizer::core::TruncStrategy::LONGEST_FIRST); + max_length, 0, fast_tokenizer::core::Direction::RIGHT, + fast_tokenizer::core::TruncStrategy::LONGEST_FIRST); } UIEModel::UIEModel(const std::string& model_file, @@ -198,8 +198,8 @@ UIEModel::UIEModel(const std::string& model_file, initialized = Initialize(); SetSchema(schema); tokenizer_.EnableTruncMethod( - max_length, 0, faster_tokenizer::core::Direction::RIGHT, - faster_tokenizer::core::TruncStrategy::LONGEST_FIRST); + max_length, 0, fast_tokenizer::core::Direction::RIGHT, + fast_tokenizer::core::TruncStrategy::LONGEST_FIRST); } UIEModel::UIEModel(const std::string& model_file, @@ -219,8 +219,8 @@ UIEModel::UIEModel(const std::string& model_file, initialized = Initialize(); SetSchema(schema); tokenizer_.EnableTruncMethod( - max_length, 0, faster_tokenizer::core::Direction::RIGHT, - faster_tokenizer::core::TruncStrategy::LONGEST_FIRST); + max_length, 0, fast_tokenizer::core::Direction::RIGHT, + fast_tokenizer::core::TruncStrategy::LONGEST_FIRST); } bool UIEModel::Initialize() { @@ -253,7 +253,7 @@ void UIEModel::AutoSplitter(const std::vector& texts, size_t cnt_org = 0; size_t cnt_short = 0; for (auto& text : texts) { - auto text_len = faster_tokenizer::utils::GetUnicodeLenFromUTF8( + auto text_len = fast_tokenizer::utils::GetUnicodeLenFromUTF8( text.c_str(), text.length()); if (text_len <= max_length) { short_texts->push_back(text); @@ -264,14 +264,14 @@ void UIEModel::AutoSplitter(const std::vector& texts, } cnt_short += 1; } else { - faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( + fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( text); for (size_t start = 0; start < text_len; start += max_length) { size_t end = start + max_length; if (end > text_len) { end = text_len; } - faster_tokenizer::core::Offset byte_offset; + fast_tokenizer::core::Offset byte_offset; converter.convert({start, end}, &byte_offset); short_texts->emplace_back(text.data() + byte_offset.first, byte_offset.second - byte_offset.first); @@ -344,12 +344,12 @@ void UIEModel::GetSpan(const std::vector& start_idx_prob, } void UIEModel::GetSpanIdxAndProbs( const SPAN_SET& span_set, - const std::vector& offset_mapping, + const std::vector& offset_mapping, std::vector* span_idxs, std::vector* probs) const { auto first_sep_idx = std::find_if(offset_mapping.begin() + 1, offset_mapping.end(), - [](const faster_tokenizer::core::Offset& offset) { - return offset == faster_tokenizer::core::Offset(0, 0); + [](const fast_tokenizer::core::Offset& offset) { + return offset == fast_tokenizer::core::Offset(0, 0); }); auto prompt_end_token_id = std::distance(offset_mapping.begin(), first_sep_idx) - 1; @@ -384,9 +384,9 @@ void UIEModel::ConvertSpanToUIEResult( std::string span_text; std::vector offset_mapping; if (span_idxs[i][j].is_prompt_) { - faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( + fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( prompt); - faster_tokenizer::core::Offset byte_offset; + fast_tokenizer::core::Offset byte_offset; converter.convert({start, end}, &byte_offset); span_text = prompt.substr(byte_offset.first, byte_offset.second - byte_offset.first); @@ -394,9 +394,9 @@ void UIEModel::ConvertSpanToUIEResult( start = 0; end = 0; } else { - faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( + fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( text); - faster_tokenizer::core::Offset byte_offset; + fast_tokenizer::core::Offset byte_offset; converter.convert({start, end}, &byte_offset); span_text = text.substr(byte_offset.first, byte_offset.second - byte_offset.first); @@ -461,14 +461,14 @@ void UIEModel::AutoJoiner(const std::vector& short_texts, for (auto&& result_idx : input_mapping_item) { if (result_idx == 0) { result_list = std::move((*results)[result_idx]); - offset += faster_tokenizer::utils::GetUnicodeLenFromUTF8( + offset += fast_tokenizer::utils::GetUnicodeLenFromUTF8( short_texts[result_idx].c_str(), short_texts[result_idx].size()); } else { for (auto&& curr_result : (*results)[result_idx]) { curr_result.start_ += offset; curr_result.end_ += offset; } - offset += faster_tokenizer::utils::GetUnicodeLenFromUTF8( + offset += fast_tokenizer::utils::GetUnicodeLenFromUTF8( short_texts[result_idx].c_str(), short_texts[result_idx].size()); result_list.insert(result_list.end(), (*results)[result_idx].begin(), (*results)[result_idx].end()); @@ -521,13 +521,13 @@ bool UIEModel::ConstructTextsAndPrompts( auto max_prompt_iter = std::max_element( prompts->begin(), prompts->end(), [](const std::string& lhs, const std::string& rhs) { - auto lhs_ulen = faster_tokenizer::utils::GetUnicodeLenFromUTF8( + auto lhs_ulen = fast_tokenizer::utils::GetUnicodeLenFromUTF8( lhs.c_str(), lhs.length()); - auto rhs_ulen = faster_tokenizer::utils::GetUnicodeLenFromUTF8( + auto rhs_ulen = fast_tokenizer::utils::GetUnicodeLenFromUTF8( rhs.c_str(), rhs.length()); return lhs_ulen < rhs_ulen; }); - auto max_prompt_len = faster_tokenizer::utils::GetUnicodeLenFromUTF8( + auto max_prompt_len = fast_tokenizer::utils::GetUnicodeLenFromUTF8( max_prompt_iter->c_str(), max_prompt_iter->length()); auto max_predict_len = max_length_ - 3 - max_prompt_len; @@ -547,10 +547,10 @@ bool UIEModel::ConstructTextsAndPrompts( void UIEModel::Preprocess( const std::vector& input_texts, const std::vector& prompts, - std::vector* encodings, + std::vector* encodings, std::vector* inputs) { // 1. Tokenize the short texts and short prompts - std::vector text_pair_input; + std::vector text_pair_input; for (int i = 0; i < input_texts.size(); ++i) { text_pair_input.emplace_back( std::pair(prompts[i], input_texts[i])); @@ -596,7 +596,7 @@ void UIEModel::Preprocess( void UIEModel::Postprocess( const std::vector& outputs, - const std::vector& encodings, + const std::vector& encodings, const std::vector& short_input_texts, const std::vector& short_prompts, const std::vector>& input_mapping_with_short_text, @@ -611,7 +611,7 @@ void UIEModel::Postprocess( GetCandidateIdx(end_prob, outputs[1].shape[0], outputs[1].shape[1], &end_candidate_idx_prob, position_prob_); - std::vector> offset_mapping; + std::vector> offset_mapping; for (int i = 0; i < encodings.size(); ++i) { auto&& curr_offsets = encodings[i].GetOffsets(); offset_mapping.push_back(curr_offsets); @@ -739,7 +739,7 @@ void UIEModel::Predict( if (has_prompt) { // 2. Convert texts and prompts to FDTensor std::vector inputs; - std::vector encodings; + std::vector encodings; Preprocess(short_input_texts, short_prompts, &encodings, &inputs); // 3. Infer diff --git a/fastdeploy/text/uie/model.h b/fastdeploy/text/uie/model.h index 48b21f8b0..c813369d3 100644 --- a/fastdeploy/text/uie/model.h +++ b/fastdeploy/text/uie/model.h @@ -21,7 +21,7 @@ #include #include "fastdeploy/fastdeploy_model.h" #include "fastdeploy/utils/unique_ptr.h" -#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h" +#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h" using namespace paddlenlp; @@ -133,11 +133,11 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel { std::vector>* input_mapping_with_short_text); void Preprocess(const std::vector& input_texts, const std::vector& prompts, - std::vector* encodings, + std::vector* encodings, std::vector* inputs); void Postprocess( const std::vector& outputs, - const std::vector& encodings, + const std::vector& encodings, const std::vector& short_input_texts, const std::vector& short_prompts, const std::vector>& input_mapping_with_short_text, @@ -167,7 +167,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel { }; using SPAN_SET = std::set, IdxProbCmp>; struct SpanIdx { - faster_tokenizer::core::Offset offset_; + fast_tokenizer::core::Offset offset_; bool is_prompt_; }; void SetValidBackend(); @@ -188,7 +188,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel { SPAN_SET* span_set) const; void GetSpanIdxAndProbs( const SPAN_SET& span_set, - const std::vector& offset_mapping, + const std::vector& offset_mapping, std::vector* span_idxs, std::vector* probs) const; void ConvertSpanToUIEResult( const std::vector& texts, @@ -200,7 +200,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel { size_t max_length_; float position_prob_; SchemaLanguage schema_language_; - faster_tokenizer::tokenizers_impl::ErnieFasterTokenizer tokenizer_; + fast_tokenizer::tokenizers_impl::ErnieFastTokenizer tokenizer_; }; } // namespace text diff --git a/scripts/fastdeploy_init.bat b/scripts/fastdeploy_init.bat index a7b8567b2..aaaf8e75e 100644 --- a/scripts/fastdeploy_init.bat +++ b/scripts/fastdeploy_init.bat @@ -46,8 +46,8 @@ if "%__script_action_type%" == "show" ( echo !__3rd_lib_file! | findstr "opencv">nul && set __3rd_needed_flag=true echo !__3rd_lib_file! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision if "!__3rd_needed_flag!"=="true" (echo !__3rd_lib_file! | findstr d\.lib>nul && set __3rd_needed_flag=false) - echo !__3rd_lib_file! | findstr "faster_tokenizer">nul && set __3rd_needed_flag=true - echo !__3rd_lib_file! | findstr "faster_tokenizer">nul && set __api_tag=!__api_tag!::text + echo !__3rd_lib_file! | findstr "fast_tokenizer">nul && set __3rd_needed_flag=true + echo !__3rd_lib_file! | findstr "fast_tokenizer">nul && set __api_tag=!__api_tag!::text if "!__3rd_needed_flag!"=="true" (echo [Lib] !__3rd_lib_file! **[NEEDED][!__api_tag!]**) else (echo [Lib] !__3rd_lib_file!) ) @@ -58,8 +58,8 @@ if "%__script_action_type%" == "show" ( set __3rd_include_dir=%%a && set __3rd_needed_flag=false && set __api_tag=fastdeploy echo !__3rd_include_dir! | findstr "opencv">nul && set __3rd_needed_flag=true echo !__3rd_include_dir! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision - echo !__3rd_include_dir! | findstr "faster_tokenizer">nul && set __3rd_needed_flag=true - echo !__3rd_include_dir! | findstr "faster_tokenizer">nul && set __api_tag=!__api_tag!::text + echo !__3rd_include_dir! | findstr "fast_tokenizer">nul && set __3rd_needed_flag=true + echo !__3rd_include_dir! | findstr "fast_tokenizer">nul && set __api_tag=!__api_tag!::text if "!__3rd_needed_flag!"=="true" (echo [Include] !__3rd_include_dir! **[NEEDED][!__api_tag!]**) else (echo [Include] !__3rd_include_dir!) ) @@ -164,4 +164,4 @@ echo --------------------------------------------------------------------------- goto:eof @rem end -@echo on \ No newline at end of file +@echo on diff --git a/scripts/fastdeploy_init.sh b/scripts/fastdeploy_init.sh index 1e5bc0bb3..0f39e8844 100644 --- a/scripts/fastdeploy_init.sh +++ b/scripts/fastdeploy_init.sh @@ -60,9 +60,4 @@ if [ -d ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite ]; th echo "Paddle Lite Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite/lib" fi -if [ -d ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer ]; then - export LD_LIBRARY_PATH=${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer/lib:${LD_LIBRARY_PATH} - echo "Faster Tokenizer Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer/lib" -fi - cd ${CURRENT_EXE_DIR} diff --git a/serving/Dockerfile b/serving/Dockerfile index cca94ebe9..fc5a45693 100644 --- a/serving/Dockerfile +++ b/serving/Dockerfile @@ -35,7 +35,7 @@ RUN apt-get update \ RUN apt-get update \ && apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \ && python3 -m pip install -U pip \ - && python3 -m pip install paddlepaddle-gpu paddlenlp faster_tokenizer + && python3 -m pip install paddlepaddle-gpu paddlenlp fast-tokenizer-python COPY python/dist/*.whl /opt/fastdeploy/ RUN python3 -m pip install /opt/fastdeploy/*.whl \ diff --git a/serving/Dockerfile_cpu b/serving/Dockerfile_cpu index 9e6d8b0ef..59171e85f 100644 --- a/serving/Dockerfile_cpu +++ b/serving/Dockerfile_cpu @@ -19,7 +19,7 @@ ENV TZ=Asia/Shanghai \ RUN apt-get update && apt-get install -y --no-install-recommends apt-utils libgomp1 ffmpeg libsm6 libxext6 \ && python3 -m pip install -U pip \ - && python3 -m pip install paddlepaddle paddlenlp faster_tokenizer + && python3 -m pip install paddlepaddle paddlenlp fast-tokenizer-python COPY python/dist/*.whl *.whl /opt/fastdeploy/ RUN python3 -m pip install /opt/fastdeploy/*.whl \ diff --git a/tests/release_task/cpp_run.bat b/tests/release_task/cpp_run.bat index 4d92ea51b..8584733c3 100644 --- a/tests/release_task/cpp_run.bat +++ b/tests/release_task/cpp_run.bat @@ -58,8 +58,8 @@ set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\insta set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mklml\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle2onnx\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\tensorrt\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\lib;%PATH% -set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\third_party\lib;%PATH% +set PATH=%FASTDEPLOY_HOME%\third_libs\install\fast_tokenizer\lib;%PATH% +set PATH=%FASTDEPLOY_HOME%\third_libs\install\fast_tokenizer\third_party\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\yaml-cpp\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\opencv\build\x64\vc15\bin;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\bin;%PATH%