[Other] faster_tokenizer->fast_tokenizer (#636)

* faster_tokenizer->fast_tokenizer

* ErnieFasterTokenizer->ErnieFastTokenizer

* update the fastdeploy_init

Co-authored-by: Jason <jiangjiajun@baidu.com>
This commit is contained in:
Jack Zhou
2022-11-21 13:45:00 +08:00
committed by GitHub
parent 3e1fc69a0c
commit eeae48deff
14 changed files with 170 additions and 175 deletions

View File

@@ -415,14 +415,14 @@ endif()
if(ANDROID OR IOS) if(ANDROID OR IOS)
if(ENABLE_TEXT) if(ENABLE_TEXT)
set(ENABLE_TEXT OFF CACHE BOOL "Force ENABLE_TEXT OFF" FORCE) set(ENABLE_TEXT OFF CACHE BOOL "Force ENABLE_TEXT OFF" FORCE)
message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support faster_tokenizer with Android/IOS now.") message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support fast_tokenizer with Android/IOS now.")
endif() endif()
endif() endif()
if(ENABLE_TEXT) if(ENABLE_TEXT)
add_definitions(-DENABLE_TEXT) add_definitions(-DENABLE_TEXT)
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TEXT_SRCS}) list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TEXT_SRCS})
include(${PROJECT_SOURCE_DIR}/cmake/faster_tokenizer.cmake) include(${PROJECT_SOURCE_DIR}/cmake/fast_tokenizer.cmake)
endif() endif()
if(ENABLE_PADDLE_FRONTEND) if(ENABLE_PADDLE_FRONTEND)

View File

@@ -213,10 +213,10 @@ if (ENABLE_TEXT)
message(FATAL_ERROR "Not support fastdeploy text APIs with Android now!") message(FATAL_ERROR "Not support fastdeploy text APIs with Android now!")
endif() endif()
# Add dependency libs later # Add dependency libs later
find_library(FASTER_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/lib NO_DEFAULT_PATH) find_library(FAST_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib NO_DEFAULT_PATH)
list(APPEND FASTDEPLOY_LIBS ${FASTER_TOKENIZER_LIB}) list(APPEND FASTDEPLOY_LIBS ${FAST_TOKENIZER_LIB})
list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/include) list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/include)
list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/third_party/include) list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/third_party/include)
endif() endif()
if(ENABLE_PADDLE_FRONTEND) if(ENABLE_PADDLE_FRONTEND)

108
cmake/fast_tokenizer.cmake Normal file
View File

@@ -0,0 +1,108 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads a prebuilt fast_tokenizer package for the host platform and
# exposes it as the imported target `fast_tokenizer` (plus `icudt`/`icuuc`
# on Windows). Expects THIRD_PARTY_PATH and EXTERNAL_PROJECT_LOG_ARGS to be
# set by the including project; appends the imported targets to DEPEND_LIBS.
include(ExternalProject)

set(FASTTOKENIZER_PROJECT "extern_fast_tokenizer")
set(FASTTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/fast_tokenizer)
set(FASTTOKENIZER_SOURCE_DIR
    ${THIRD_PARTY_PATH}/fast_tokenizer/src/${FASTTOKENIZER_PROJECT})
set(FASTTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/fast_tokenizer)
set(FASTTOKENIZER_INC_DIR
    "${FASTTOKENIZER_INSTALL_DIR}/include"
    "${FASTTOKENIZER_INSTALL_DIR}/third_party/include"
    CACHE PATH "fast_tokenizer include directory." FORCE)
set(FASTTOKENIZER_LIB_DIR
    "${FASTTOKENIZER_INSTALL_DIR}/lib/"
    CACHE PATH "fast_tokenizer lib directory." FORCE)
set(FASTTOKENIZER_THIRD_LIB_DIR
    "${FASTTOKENIZER_INSTALL_DIR}/third_party/lib/"
    CACHE PATH "fast_tokenizer third-party lib directory." FORCE)

# Let binaries built from the build tree locate the shared library at runtime.
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${FASTTOKENIZER_LIB_DIR}")

# NOTE(review): directory-scoped include — target-scoped
# target_include_directories() on the consuming targets would be preferable,
# but this matches how the including project consumes this module today.
include_directories(${FASTTOKENIZER_INC_DIR})

# Resolve the platform-specific library file path.
if(WIN32)
  set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/core_tokenizers.lib"
      CACHE FILEPATH "fast_tokenizer compile library." FORCE)
  # The ICU data/common import libraries are only shipped (and needed) on Windows.
  set(ICUDT_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
  set(ICUUC_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")
elseif(APPLE)
  set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
      CACHE FILEPATH "fast_tokenizer compile library." FORCE)
else()
  set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
      CACHE FILEPATH "fast_tokenizer compile library." FORCE)
endif()
# Log the resolved path once, for every platform (previously APPLE was
# silently skipped, and bare message() wrote to the NOTICE/stderr stream).
message(STATUS "FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")

set(FASTTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/fast_tokenizer/")
set(FASTTOKENIZER_VERSION "1.0.0")

# Pick the download archive matching the host OS / architecture.
if(WIN32)
  set(FASTTOKENIZER_FILE "fast_tokenizer-win-x64-${FASTTOKENIZER_VERSION}.zip")
  if(NOT CMAKE_CL_64)
    set(FASTTOKENIZER_FILE "fast_tokenizer-win-x86-${FASTTOKENIZER_VERSION}.zip")
  endif()
elseif(APPLE)
  if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
    set(FASTTOKENIZER_FILE "fast_tokenizer-osx-arm64-${FASTTOKENIZER_VERSION}.tgz")
  else()
    set(FASTTOKENIZER_FILE "fast_tokenizer-osx-x86_64-${FASTTOKENIZER_VERSION}.tgz")
  endif()
else()
  if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
    set(FASTTOKENIZER_FILE "fast_tokenizer-linux-aarch64-${FASTTOKENIZER_VERSION}.tgz")
  else()
    set(FASTTOKENIZER_FILE "fast_tokenizer-linux-x64-${FASTTOKENIZER_VERSION}.tgz")
  endif()
endif()
set(FASTTOKENIZER_URL "${FASTTOKENIZER_URL_BASE}${FASTTOKENIZER_FILE}")

# Download-and-copy only: the archive is prebuilt, so configure/build steps
# are disabled and the "install" step simply mirrors the extracted tree.
ExternalProject_Add(
  ${FASTTOKENIZER_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  URL ${FASTTOKENIZER_URL}
  PREFIX ${FASTTOKENIZER_PREFIX_DIR}
  DOWNLOAD_NO_PROGRESS 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  UPDATE_COMMAND ""
  INSTALL_COMMAND
    ${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR} ${FASTTOKENIZER_INSTALL_DIR}
  BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB})

# NOTE(review): declared STATIC IMPORTED although the artifact is a shared
# library on Linux/macOS; UNKNOWN IMPORTED would be more accurate. Kept as-is
# to avoid changing link behavior for existing consumers — confirm before
# tightening.
add_library(fast_tokenizer STATIC IMPORTED GLOBAL)
set_property(TARGET fast_tokenizer PROPERTY IMPORTED_LOCATION ${FASTTOKENIZER_COMPILE_LIB})
# add_dependencies only orders the download before any consumer link step.
add_dependencies(fast_tokenizer ${FASTTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS fast_tokenizer)

if(WIN32)
  # fast_tokenizer links against ICU on Windows; expose both libraries so
  # consumers pick them up through DEPEND_LIBS.
  add_library(icudt STATIC IMPORTED GLOBAL)
  set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB})
  add_dependencies(icudt ${FASTTOKENIZER_PROJECT})
  list(APPEND DEPEND_LIBS icudt)
  add_library(icuuc STATIC IMPORTED GLOBAL)
  set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB})
  add_dependencies(icuuc ${FASTTOKENIZER_PROJECT})
  list(APPEND DEPEND_LIBS icuuc)
endif()

View File

@@ -1,108 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(FASTERTOKENIZER_PROJECT "extern_faster_tokenizer")
set(FASTERTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/faster_tokenizer)
set(FASTERTOKENIZER_SOURCE_DIR
${THIRD_PARTY_PATH}/faster_tokenizer/src/${FASTERTOKENIZER_PROJECT})
set(FASTERTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/faster_tokenizer)
set(FASTERTOKENIZER_INC_DIR
"${FASTERTOKENIZER_INSTALL_DIR}/include"
"${FASTERTOKENIZER_INSTALL_DIR}/third_party/include"
CACHE PATH "faster_tokenizer include directory." FORCE)
set(FASTERTOKENIZER_LIB_DIR
"${FASTERTOKENIZER_INSTALL_DIR}/lib/"
CACHE PATH "faster_tokenizer lib directory." FORCE)
set(FASTERTOKENIZER_THIRD_LIB_DIR
"${FASTERTOKENIZER_INSTALL_DIR}/third_party/lib/"
CACHE PATH "faster_tokenizer lib directory." FORCE)
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}"
"${FASTERTOKENIZER_LIB_DIR}")
include_directories(${FASTERTOKENIZER_INC_DIR})
# Set lib path
if(WIN32)
set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/core_tokenizers.lib"
CACHE FILEPATH "faster_tokenizer compile library." FORCE)
message("FASTERTOKENIZER_COMPILE_LIB = ${FASTERTOKENIZER_COMPILE_LIB}")
set(ICUDT_LIB "${FASTERTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
set(ICUUC_LIB "${FASTERTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")
elseif(APPLE)
set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
CACHE FILEPATH "faster_tokenizer compile library." FORCE)
else()
set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
CACHE FILEPATH "faster_tokenizer compile library." FORCE)
message("FASTERTOKENIZER_COMPILE_LIB = ${FASTERTOKENIZER_COMPILE_LIB}")
endif(WIN32)
set(FASTERTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/faster_tokenizer/")
set(FASTERTOKENIZER_VERSION "dev")
# Set download url
if(WIN32)
set(FASTERTOKENIZER_FILE "faster_tokenizer-win-x64-${FASTERTOKENIZER_VERSION}.zip")
if(NOT CMAKE_CL_64)
set(FASTERTOKENIZER_FILE "faster_tokenizer-win-x86-${FASTERTOKENIZER_VERSION}.zip")
endif()
elseif(APPLE)
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
set(FASTERTOKENIZER_FILE "faster_tokenizer-osx-arm64-${FASTERTOKENIZER_VERSION}.tgz")
else()
set(FASTERTOKENIZER_FILE "faster_tokenizer-osx-x86_64-${FASTERTOKENIZER_VERSION}.tgz")
endif()
else()
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(FASTERTOKENIZER_FILE "faster_tokenizer-linux-aarch64-${FASTERTOKENIZER_VERSION}.tgz")
else()
set(FASTERTOKENIZER_FILE "faster_tokenizer-linux-x64-${FASTERTOKENIZER_VERSION}.tgz")
endif()
endif()
set(FASTERTOKENIZER_URL "${FASTERTOKENIZER_URL_BASE}${FASTERTOKENIZER_FILE}")
ExternalProject_Add(
${FASTERTOKENIZER_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${FASTERTOKENIZER_URL}
PREFIX ${FASTERTOKENIZER_PREFIX_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${FASTERTOKENIZER_SOURCE_DIR} ${FASTERTOKENIZER_INSTALL_DIR}
BUILD_BYPRODUCTS ${FASTERTOKENIZER_COMPILE_LIB})
add_library(faster_tokenizer STATIC IMPORTED GLOBAL)
set_property(TARGET faster_tokenizer PROPERTY IMPORTED_LOCATION ${FASTERTOKENIZER_COMPILE_LIB})
add_dependencies(faster_tokenizer ${FASTERTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS faster_tokenizer)
if (WIN32)
add_library(icudt STATIC IMPORTED GLOBAL)
set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB})
add_dependencies(icudt ${FASTERTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS icudt)
add_library(icuuc STATIC IMPORTED GLOBAL)
set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB})
add_dependencies(icuuc ${FASTERTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS icuuc)
endif()

View File

@@ -18,11 +18,11 @@
#include "fastdeploy/function/softmax.h" #include "fastdeploy/function/softmax.h"
#include "fastdeploy/runtime.h" #include "fastdeploy/runtime.h"
#include "fastdeploy/utils/path.h" #include "fastdeploy/utils/path.h"
#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h" #include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
#include "gflags/gflags.h" #include "gflags/gflags.h"
using namespace paddlenlp; using namespace paddlenlp;
using namespace faster_tokenizer::tokenizers_impl; using namespace fast_tokenizer::tokenizers_impl;
#ifdef WIN32 #ifdef WIN32
const char sep = '\\'; const char sep = '\\';
#else #else
@@ -124,10 +124,10 @@ struct SeqClsResult {
struct ErnieForSequenceClassificationPredictor { struct ErnieForSequenceClassificationPredictor {
fastdeploy::Runtime runtime_; fastdeploy::Runtime runtime_;
ErnieFasterTokenizer tokenizer_; ErnieFastTokenizer tokenizer_;
ErnieForSequenceClassificationPredictor( ErnieForSequenceClassificationPredictor(
const fastdeploy::RuntimeOption& option, const fastdeploy::RuntimeOption& option,
const ErnieFasterTokenizer& tokenizer) const ErnieFastTokenizer& tokenizer)
: tokenizer_(tokenizer) { : tokenizer_(tokenizer) {
runtime_.Init(option); runtime_.Init(option);
} }
@@ -135,8 +135,8 @@ struct ErnieForSequenceClassificationPredictor {
bool Preprocess(const std::vector<std::string>& texts, bool Preprocess(const std::vector<std::string>& texts,
const std::vector<std::string>& texts_pair, const std::vector<std::string>& texts_pair,
std::vector<fastdeploy::FDTensor>* inputs) { std::vector<fastdeploy::FDTensor>* inputs) {
std::vector<faster_tokenizer::core::Encoding> encodings; std::vector<fast_tokenizer::core::Encoding> encodings;
std::vector<faster_tokenizer::core::EncodeInput> text_pair_input; std::vector<fast_tokenizer::core::EncodeInput> text_pair_input;
// 1. Tokenize the text or (text, text_pair) // 1. Tokenize the text or (text, text_pair)
if (texts_pair.empty()) { if (texts_pair.empty()) {
for (int i = 0; i < texts.size(); ++i) { for (int i = 0; i < texts.size(); ++i) {
@@ -242,7 +242,7 @@ int main(int argc, char* argv[]) {
return -1; return -1;
} }
} }
ErnieFasterTokenizer tokenizer(vocab_path); ErnieFastTokenizer tokenizer(vocab_path);
ErnieForSequenceClassificationPredictor predictor(option, tokenizer); ErnieForSequenceClassificationPredictor predictor(option, tokenizer);

View File

@@ -1,2 +1,2 @@
faster_tokenizer fast-tokenizer-python
paddlenlp paddlenlp

View File

@@ -15,7 +15,7 @@ import os
import distutils.util import distutils.util
import numpy as np import numpy as np
import faster_tokenizer import fast_tokenizer
from paddlenlp.transformers import AutoTokenizer from paddlenlp.transformers import AutoTokenizer
import fastdeploy as fd import fastdeploy as fd

View File

@@ -19,8 +19,8 @@
#include <queue> #include <queue>
#include <sstream> #include <sstream>
#include "faster_tokenizer/pretokenizers/pretokenizer.h" #include "fast_tokenizer/pretokenizers/pretokenizer.h"
#include "faster_tokenizer/utils/utf8.h" #include "fast_tokenizer/utils/utf8.h"
namespace fastdeploy { namespace fastdeploy {
namespace text { namespace text {
@@ -30,9 +30,9 @@ static std::string DBC2SBC(const std::string& content) {
size_t content_utf8_len = 0; size_t content_utf8_len = 0;
while (content_utf8_len < content.length()) { while (content_utf8_len < content.length()) {
uint32_t content_char; uint32_t content_char;
auto content_char_width = faster_tokenizer::utils::UTF8ToUInt32( auto content_char_width = fast_tokenizer::utils::UTF8ToUInt32(
content.data() + content_utf8_len, &content_char); content.data() + content_utf8_len, &content_char);
content_char = faster_tokenizer::utils::UTF8ToUnicode(content_char); content_char = fast_tokenizer::utils::UTF8ToUnicode(content_char);
if (content_char == 0x3000) { if (content_char == 0x3000) {
content_char = 0x0020; content_char = 0x0020;
} else { } else {
@@ -43,9 +43,9 @@ static std::string DBC2SBC(const std::string& content) {
} else { } else {
char dst_char[5] = {0}; char dst_char[5] = {0};
uint32_t utf8_uint32 = uint32_t utf8_uint32 =
faster_tokenizer::utils::UnicodeToUTF8(content_char); fast_tokenizer::utils::UnicodeToUTF8(content_char);
uint32_t utf8_char_count = uint32_t utf8_char_count =
faster_tokenizer::utils::UnicodeToUTF8Char(utf8_uint32, dst_char); fast_tokenizer::utils::UnicodeToUTF8Char(utf8_uint32, dst_char);
result.append(dst_char, utf8_char_count); result.append(dst_char, utf8_char_count);
} }
content_utf8_len += content_char_width; content_utf8_len += content_char_width;
@@ -177,8 +177,8 @@ UIEModel::UIEModel(const std::string& model_file,
initialized = Initialize(); initialized = Initialize();
SetSchema(schema); SetSchema(schema);
tokenizer_.EnableTruncMethod( tokenizer_.EnableTruncMethod(
max_length, 0, faster_tokenizer::core::Direction::RIGHT, max_length, 0, fast_tokenizer::core::Direction::RIGHT,
faster_tokenizer::core::TruncStrategy::LONGEST_FIRST); fast_tokenizer::core::TruncStrategy::LONGEST_FIRST);
} }
UIEModel::UIEModel(const std::string& model_file, UIEModel::UIEModel(const std::string& model_file,
@@ -198,8 +198,8 @@ UIEModel::UIEModel(const std::string& model_file,
initialized = Initialize(); initialized = Initialize();
SetSchema(schema); SetSchema(schema);
tokenizer_.EnableTruncMethod( tokenizer_.EnableTruncMethod(
max_length, 0, faster_tokenizer::core::Direction::RIGHT, max_length, 0, fast_tokenizer::core::Direction::RIGHT,
faster_tokenizer::core::TruncStrategy::LONGEST_FIRST); fast_tokenizer::core::TruncStrategy::LONGEST_FIRST);
} }
UIEModel::UIEModel(const std::string& model_file, UIEModel::UIEModel(const std::string& model_file,
@@ -219,8 +219,8 @@ UIEModel::UIEModel(const std::string& model_file,
initialized = Initialize(); initialized = Initialize();
SetSchema(schema); SetSchema(schema);
tokenizer_.EnableTruncMethod( tokenizer_.EnableTruncMethod(
max_length, 0, faster_tokenizer::core::Direction::RIGHT, max_length, 0, fast_tokenizer::core::Direction::RIGHT,
faster_tokenizer::core::TruncStrategy::LONGEST_FIRST); fast_tokenizer::core::TruncStrategy::LONGEST_FIRST);
} }
bool UIEModel::Initialize() { bool UIEModel::Initialize() {
@@ -253,7 +253,7 @@ void UIEModel::AutoSplitter(const std::vector<std::string>& texts,
size_t cnt_org = 0; size_t cnt_org = 0;
size_t cnt_short = 0; size_t cnt_short = 0;
for (auto& text : texts) { for (auto& text : texts) {
auto text_len = faster_tokenizer::utils::GetUnicodeLenFromUTF8( auto text_len = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
text.c_str(), text.length()); text.c_str(), text.length());
if (text_len <= max_length) { if (text_len <= max_length) {
short_texts->push_back(text); short_texts->push_back(text);
@@ -264,14 +264,14 @@ void UIEModel::AutoSplitter(const std::vector<std::string>& texts,
} }
cnt_short += 1; cnt_short += 1;
} else { } else {
faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
text); text);
for (size_t start = 0; start < text_len; start += max_length) { for (size_t start = 0; start < text_len; start += max_length) {
size_t end = start + max_length; size_t end = start + max_length;
if (end > text_len) { if (end > text_len) {
end = text_len; end = text_len;
} }
faster_tokenizer::core::Offset byte_offset; fast_tokenizer::core::Offset byte_offset;
converter.convert({start, end}, &byte_offset); converter.convert({start, end}, &byte_offset);
short_texts->emplace_back(text.data() + byte_offset.first, short_texts->emplace_back(text.data() + byte_offset.first,
byte_offset.second - byte_offset.first); byte_offset.second - byte_offset.first);
@@ -344,12 +344,12 @@ void UIEModel::GetSpan(const std::vector<IDX_PROB>& start_idx_prob,
} }
void UIEModel::GetSpanIdxAndProbs( void UIEModel::GetSpanIdxAndProbs(
const SPAN_SET& span_set, const SPAN_SET& span_set,
const std::vector<faster_tokenizer::core::Offset>& offset_mapping, const std::vector<fast_tokenizer::core::Offset>& offset_mapping,
std::vector<SpanIdx>* span_idxs, std::vector<float>* probs) const { std::vector<SpanIdx>* span_idxs, std::vector<float>* probs) const {
auto first_sep_idx = auto first_sep_idx =
std::find_if(offset_mapping.begin() + 1, offset_mapping.end(), std::find_if(offset_mapping.begin() + 1, offset_mapping.end(),
[](const faster_tokenizer::core::Offset& offset) { [](const fast_tokenizer::core::Offset& offset) {
return offset == faster_tokenizer::core::Offset(0, 0); return offset == fast_tokenizer::core::Offset(0, 0);
}); });
auto prompt_end_token_id = auto prompt_end_token_id =
std::distance(offset_mapping.begin(), first_sep_idx) - 1; std::distance(offset_mapping.begin(), first_sep_idx) - 1;
@@ -384,9 +384,9 @@ void UIEModel::ConvertSpanToUIEResult(
std::string span_text; std::string span_text;
std::vector<uint32_t> offset_mapping; std::vector<uint32_t> offset_mapping;
if (span_idxs[i][j].is_prompt_) { if (span_idxs[i][j].is_prompt_) {
faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
prompt); prompt);
faster_tokenizer::core::Offset byte_offset; fast_tokenizer::core::Offset byte_offset;
converter.convert({start, end}, &byte_offset); converter.convert({start, end}, &byte_offset);
span_text = prompt.substr(byte_offset.first, span_text = prompt.substr(byte_offset.first,
byte_offset.second - byte_offset.first); byte_offset.second - byte_offset.first);
@@ -394,9 +394,9 @@ void UIEModel::ConvertSpanToUIEResult(
start = 0; start = 0;
end = 0; end = 0;
} else { } else {
faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter( fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
text); text);
faster_tokenizer::core::Offset byte_offset; fast_tokenizer::core::Offset byte_offset;
converter.convert({start, end}, &byte_offset); converter.convert({start, end}, &byte_offset);
span_text = text.substr(byte_offset.first, span_text = text.substr(byte_offset.first,
byte_offset.second - byte_offset.first); byte_offset.second - byte_offset.first);
@@ -461,14 +461,14 @@ void UIEModel::AutoJoiner(const std::vector<std::string>& short_texts,
for (auto&& result_idx : input_mapping_item) { for (auto&& result_idx : input_mapping_item) {
if (result_idx == 0) { if (result_idx == 0) {
result_list = std::move((*results)[result_idx]); result_list = std::move((*results)[result_idx]);
offset += faster_tokenizer::utils::GetUnicodeLenFromUTF8( offset += fast_tokenizer::utils::GetUnicodeLenFromUTF8(
short_texts[result_idx].c_str(), short_texts[result_idx].size()); short_texts[result_idx].c_str(), short_texts[result_idx].size());
} else { } else {
for (auto&& curr_result : (*results)[result_idx]) { for (auto&& curr_result : (*results)[result_idx]) {
curr_result.start_ += offset; curr_result.start_ += offset;
curr_result.end_ += offset; curr_result.end_ += offset;
} }
offset += faster_tokenizer::utils::GetUnicodeLenFromUTF8( offset += fast_tokenizer::utils::GetUnicodeLenFromUTF8(
short_texts[result_idx].c_str(), short_texts[result_idx].size()); short_texts[result_idx].c_str(), short_texts[result_idx].size());
result_list.insert(result_list.end(), (*results)[result_idx].begin(), result_list.insert(result_list.end(), (*results)[result_idx].begin(),
(*results)[result_idx].end()); (*results)[result_idx].end());
@@ -521,13 +521,13 @@ bool UIEModel::ConstructTextsAndPrompts(
auto max_prompt_iter = std::max_element( auto max_prompt_iter = std::max_element(
prompts->begin(), prompts->end(), prompts->begin(), prompts->end(),
[](const std::string& lhs, const std::string& rhs) { [](const std::string& lhs, const std::string& rhs) {
auto lhs_ulen = faster_tokenizer::utils::GetUnicodeLenFromUTF8( auto lhs_ulen = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
lhs.c_str(), lhs.length()); lhs.c_str(), lhs.length());
auto rhs_ulen = faster_tokenizer::utils::GetUnicodeLenFromUTF8( auto rhs_ulen = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
rhs.c_str(), rhs.length()); rhs.c_str(), rhs.length());
return lhs_ulen < rhs_ulen; return lhs_ulen < rhs_ulen;
}); });
auto max_prompt_len = faster_tokenizer::utils::GetUnicodeLenFromUTF8( auto max_prompt_len = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
max_prompt_iter->c_str(), max_prompt_iter->length()); max_prompt_iter->c_str(), max_prompt_iter->length());
auto max_predict_len = max_length_ - 3 - max_prompt_len; auto max_predict_len = max_length_ - 3 - max_prompt_len;
@@ -547,10 +547,10 @@ bool UIEModel::ConstructTextsAndPrompts(
void UIEModel::Preprocess( void UIEModel::Preprocess(
const std::vector<std::string>& input_texts, const std::vector<std::string>& input_texts,
const std::vector<std::string>& prompts, const std::vector<std::string>& prompts,
std::vector<faster_tokenizer::core::Encoding>* encodings, std::vector<fast_tokenizer::core::Encoding>* encodings,
std::vector<fastdeploy::FDTensor>* inputs) { std::vector<fastdeploy::FDTensor>* inputs) {
// 1. Tokenize the short texts and short prompts // 1. Tokenize the short texts and short prompts
std::vector<faster_tokenizer::core::EncodeInput> text_pair_input; std::vector<fast_tokenizer::core::EncodeInput> text_pair_input;
for (int i = 0; i < input_texts.size(); ++i) { for (int i = 0; i < input_texts.size(); ++i) {
text_pair_input.emplace_back( text_pair_input.emplace_back(
std::pair<std::string, std::string>(prompts[i], input_texts[i])); std::pair<std::string, std::string>(prompts[i], input_texts[i]));
@@ -596,7 +596,7 @@ void UIEModel::Preprocess(
void UIEModel::Postprocess( void UIEModel::Postprocess(
const std::vector<fastdeploy::FDTensor>& outputs, const std::vector<fastdeploy::FDTensor>& outputs,
const std::vector<faster_tokenizer::core::Encoding>& encodings, const std::vector<fast_tokenizer::core::Encoding>& encodings,
const std::vector<std::string>& short_input_texts, const std::vector<std::string>& short_input_texts,
const std::vector<std::string>& short_prompts, const std::vector<std::string>& short_prompts,
const std::vector<std::vector<size_t>>& input_mapping_with_short_text, const std::vector<std::vector<size_t>>& input_mapping_with_short_text,
@@ -611,7 +611,7 @@ void UIEModel::Postprocess(
GetCandidateIdx(end_prob, outputs[1].shape[0], outputs[1].shape[1], GetCandidateIdx(end_prob, outputs[1].shape[0], outputs[1].shape[1],
&end_candidate_idx_prob, position_prob_); &end_candidate_idx_prob, position_prob_);
std::vector<std::vector<faster_tokenizer::core::Offset>> offset_mapping; std::vector<std::vector<fast_tokenizer::core::Offset>> offset_mapping;
for (int i = 0; i < encodings.size(); ++i) { for (int i = 0; i < encodings.size(); ++i) {
auto&& curr_offsets = encodings[i].GetOffsets(); auto&& curr_offsets = encodings[i].GetOffsets();
offset_mapping.push_back(curr_offsets); offset_mapping.push_back(curr_offsets);
@@ -739,7 +739,7 @@ void UIEModel::Predict(
if (has_prompt) { if (has_prompt) {
// 2. Convert texts and prompts to FDTensor // 2. Convert texts and prompts to FDTensor
std::vector<FDTensor> inputs; std::vector<FDTensor> inputs;
std::vector<faster_tokenizer::core::Encoding> encodings; std::vector<fast_tokenizer::core::Encoding> encodings;
Preprocess(short_input_texts, short_prompts, &encodings, &inputs); Preprocess(short_input_texts, short_prompts, &encodings, &inputs);
// 3. Infer // 3. Infer

View File

@@ -21,7 +21,7 @@
#include <vector> #include <vector>
#include "fastdeploy/fastdeploy_model.h" #include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/utils/unique_ptr.h" #include "fastdeploy/utils/unique_ptr.h"
#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h" #include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
using namespace paddlenlp; using namespace paddlenlp;
@@ -133,11 +133,11 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
std::vector<std::vector<size_t>>* input_mapping_with_short_text); std::vector<std::vector<size_t>>* input_mapping_with_short_text);
void Preprocess(const std::vector<std::string>& input_texts, void Preprocess(const std::vector<std::string>& input_texts,
const std::vector<std::string>& prompts, const std::vector<std::string>& prompts,
std::vector<faster_tokenizer::core::Encoding>* encodings, std::vector<fast_tokenizer::core::Encoding>* encodings,
std::vector<fastdeploy::FDTensor>* inputs); std::vector<fastdeploy::FDTensor>* inputs);
void Postprocess( void Postprocess(
const std::vector<fastdeploy::FDTensor>& outputs, const std::vector<fastdeploy::FDTensor>& outputs,
const std::vector<faster_tokenizer::core::Encoding>& encodings, const std::vector<fast_tokenizer::core::Encoding>& encodings,
const std::vector<std::string>& short_input_texts, const std::vector<std::string>& short_input_texts,
const std::vector<std::string>& short_prompts, const std::vector<std::string>& short_prompts,
const std::vector<std::vector<size_t>>& input_mapping_with_short_text, const std::vector<std::vector<size_t>>& input_mapping_with_short_text,
@@ -167,7 +167,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
}; };
using SPAN_SET = std::set<std::pair<IDX_PROB, IDX_PROB>, IdxProbCmp>; using SPAN_SET = std::set<std::pair<IDX_PROB, IDX_PROB>, IdxProbCmp>;
struct SpanIdx { struct SpanIdx {
faster_tokenizer::core::Offset offset_; fast_tokenizer::core::Offset offset_;
bool is_prompt_; bool is_prompt_;
}; };
void SetValidBackend(); void SetValidBackend();
@@ -188,7 +188,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
SPAN_SET* span_set) const; SPAN_SET* span_set) const;
void GetSpanIdxAndProbs( void GetSpanIdxAndProbs(
const SPAN_SET& span_set, const SPAN_SET& span_set,
const std::vector<faster_tokenizer::core::Offset>& offset_mapping, const std::vector<fast_tokenizer::core::Offset>& offset_mapping,
std::vector<SpanIdx>* span_idxs, std::vector<float>* probs) const; std::vector<SpanIdx>* span_idxs, std::vector<float>* probs) const;
void ConvertSpanToUIEResult( void ConvertSpanToUIEResult(
const std::vector<std::string>& texts, const std::vector<std::string>& texts,
@@ -200,7 +200,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
size_t max_length_; size_t max_length_;
float position_prob_; float position_prob_;
SchemaLanguage schema_language_; SchemaLanguage schema_language_;
faster_tokenizer::tokenizers_impl::ErnieFasterTokenizer tokenizer_; fast_tokenizer::tokenizers_impl::ErnieFastTokenizer tokenizer_;
}; };
} // namespace text } // namespace text

View File

@@ -46,8 +46,8 @@ if "%__script_action_type%" == "show" (
echo !__3rd_lib_file! | findstr "opencv">nul && set __3rd_needed_flag=true echo !__3rd_lib_file! | findstr "opencv">nul && set __3rd_needed_flag=true
echo !__3rd_lib_file! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision echo !__3rd_lib_file! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision
if "!__3rd_needed_flag!"=="true" (echo !__3rd_lib_file! | findstr d\.lib>nul && set __3rd_needed_flag=false) if "!__3rd_needed_flag!"=="true" (echo !__3rd_lib_file! | findstr d\.lib>nul && set __3rd_needed_flag=false)
echo !__3rd_lib_file! | findstr "faster_tokenizer">nul && set __3rd_needed_flag=true echo !__3rd_lib_file! | findstr "fast_tokenizer">nul && set __3rd_needed_flag=true
echo !__3rd_lib_file! | findstr "faster_tokenizer">nul && set __api_tag=!__api_tag!::text echo !__3rd_lib_file! | findstr "fast_tokenizer">nul && set __api_tag=!__api_tag!::text
if "!__3rd_needed_flag!"=="true" (echo [Lib] !__3rd_lib_file! **[NEEDED][!__api_tag!]**) else (echo [Lib] !__3rd_lib_file!) if "!__3rd_needed_flag!"=="true" (echo [Lib] !__3rd_lib_file! **[NEEDED][!__api_tag!]**) else (echo [Lib] !__3rd_lib_file!)
) )
@@ -58,8 +58,8 @@ if "%__script_action_type%" == "show" (
set __3rd_include_dir=%%a && set __3rd_needed_flag=false && set __api_tag=fastdeploy set __3rd_include_dir=%%a && set __3rd_needed_flag=false && set __api_tag=fastdeploy
echo !__3rd_include_dir! | findstr "opencv">nul && set __3rd_needed_flag=true echo !__3rd_include_dir! | findstr "opencv">nul && set __3rd_needed_flag=true
echo !__3rd_include_dir! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision echo !__3rd_include_dir! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision
echo !__3rd_include_dir! | findstr "faster_tokenizer">nul && set __3rd_needed_flag=true echo !__3rd_include_dir! | findstr "fast_tokenizer">nul && set __3rd_needed_flag=true
echo !__3rd_include_dir! | findstr "faster_tokenizer">nul && set __api_tag=!__api_tag!::text echo !__3rd_include_dir! | findstr "fast_tokenizer">nul && set __api_tag=!__api_tag!::text
if "!__3rd_needed_flag!"=="true" (echo [Include] !__3rd_include_dir! **[NEEDED][!__api_tag!]**) else (echo [Include] !__3rd_include_dir!) if "!__3rd_needed_flag!"=="true" (echo [Include] !__3rd_include_dir! **[NEEDED][!__api_tag!]**) else (echo [Include] !__3rd_include_dir!)
) )

View File

@@ -60,9 +60,4 @@ if [ -d ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite ]; th
echo "Paddle Lite Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite/lib" echo "Paddle Lite Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite/lib"
fi fi
if [ -d ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer ]; then
export LD_LIBRARY_PATH=${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer/lib:${LD_LIBRARY_PATH}
echo "Faster Tokenizer Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer/lib"
fi
cd ${CURRENT_EXE_DIR} cd ${CURRENT_EXE_DIR}

View File

@@ -35,7 +35,7 @@ RUN apt-get update \
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \ && apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \
&& python3 -m pip install -U pip \ && python3 -m pip install -U pip \
&& python3 -m pip install paddlepaddle-gpu paddlenlp faster_tokenizer && python3 -m pip install paddlepaddle-gpu paddlenlp fast-tokenizer-python
COPY python/dist/*.whl /opt/fastdeploy/ COPY python/dist/*.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \ RUN python3 -m pip install /opt/fastdeploy/*.whl \

View File

@@ -19,7 +19,7 @@ ENV TZ=Asia/Shanghai \
RUN apt-get update && apt-get install -y --no-install-recommends apt-utils libgomp1 ffmpeg libsm6 libxext6 \ RUN apt-get update && apt-get install -y --no-install-recommends apt-utils libgomp1 ffmpeg libsm6 libxext6 \
&& python3 -m pip install -U pip \ && python3 -m pip install -U pip \
&& python3 -m pip install paddlepaddle paddlenlp faster_tokenizer && python3 -m pip install paddlepaddle paddlenlp fast-tokenizer-python
COPY python/dist/*.whl *.whl /opt/fastdeploy/ COPY python/dist/*.whl *.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \ RUN python3 -m pip install /opt/fastdeploy/*.whl \

View File

@@ -58,8 +58,8 @@ set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\insta
set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mklml\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mklml\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle2onnx\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle2onnx\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\tensorrt\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\tensorrt\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\fast_tokenizer\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\third_party\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\fast_tokenizer\third_party\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\yaml-cpp\lib;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\yaml-cpp\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\opencv\build\x64\vc15\bin;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\opencv\build\x64\vc15\bin;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\bin;%PATH% set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\bin;%PATH%