[Other] faster_tokenizer->fast_tokenizer (#636)

* faster_tokenizer->fast_tokenizer

* ErnieFasterTokenizer->ErnieFastTokenizer

* update the fastdeploy_init

Co-authored-by: Jason <jiangjiajun@baidu.com>
This commit is contained in:
Jack Zhou
2022-11-21 13:45:00 +08:00
committed by GitHub
parent 3e1fc69a0c
commit eeae48deff
14 changed files with 170 additions and 175 deletions

View File

@@ -415,14 +415,14 @@ endif()
if(ANDROID OR IOS)
if(ENABLE_TEXT)
set(ENABLE_TEXT OFF CACHE BOOL "Force ENABLE_TEXT OFF" FORCE)
message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support faster_tokenizer with Android/IOS now.")
message(STATUS "Found Android or IOS, force ENABLE_TEXT OFF. We do not support fast_tokenizer with Android/IOS now.")
endif()
endif()
if(ENABLE_TEXT)
add_definitions(-DENABLE_TEXT)
list(APPEND ALL_DEPLOY_SRCS ${DEPLOY_TEXT_SRCS})
include(${PROJECT_SOURCE_DIR}/cmake/faster_tokenizer.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/fast_tokenizer.cmake)
endif()
if(ENABLE_PADDLE_FRONTEND)

View File

@@ -213,10 +213,10 @@ if (ENABLE_TEXT)
message(FATAL_ERROR "Not support fastdeploy text APIs with Android now!")
endif()
# Add dependency libs later
find_library(FASTER_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/lib NO_DEFAULT_PATH)
list(APPEND FASTDEPLOY_LIBS ${FASTER_TOKENIZER_LIB})
list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/include)
list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/faster_tokenizer/third_party/include)
find_library(FAST_TOKENIZER_LIB core_tokenizers ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/lib NO_DEFAULT_PATH)
list(APPEND FASTDEPLOY_LIBS ${FAST_TOKENIZER_LIB})
list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/include)
list(APPEND FASTDEPLOY_INCS ${CMAKE_CURRENT_LIST_DIR}/third_libs/install/fast_tokenizer/third_party/include)
endif()
if(ENABLE_PADDLE_FRONTEND)

108
cmake/fast_tokenizer.cmake Normal file
View File

@@ -0,0 +1,108 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Downloads a prebuilt fast_tokenizer package for the host platform and
# exposes it as the imported target `fast_tokenizer` (plus `icudt`/`icuuc`
# on Windows), appending all of them to DEPEND_LIBS.
# Assumes THIRD_PARTY_PATH and EXTERNAL_PROJECT_LOG_ARGS are defined by the
# including scope.
include(ExternalProject)

set(FASTTOKENIZER_PROJECT "extern_fast_tokenizer")
set(FASTTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/fast_tokenizer)
set(FASTTOKENIZER_SOURCE_DIR
    ${THIRD_PARTY_PATH}/fast_tokenizer/src/${FASTTOKENIZER_PROJECT})
set(FASTTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/fast_tokenizer)
# NOTE(review): FORCE-setting cache paths stomps user overrides, but the rest
# of the build reads these cache entries, so the convention is kept as-is.
set(FASTTOKENIZER_INC_DIR
    "${FASTTOKENIZER_INSTALL_DIR}/include"
    "${FASTTOKENIZER_INSTALL_DIR}/third_party/include"
    CACHE PATH "fast_tokenizer include directory." FORCE)
set(FASTTOKENIZER_LIB_DIR
    "${FASTTOKENIZER_INSTALL_DIR}/lib/"
    CACHE PATH "fast_tokenizer lib directory." FORCE)
set(FASTTOKENIZER_THIRD_LIB_DIR
    "${FASTTOKENIZER_INSTALL_DIR}/third_party/lib/"
    CACHE PATH "fast_tokenizer third-party lib directory." FORCE)

# Let binaries built in-tree locate the downloaded shared library at run time.
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${FASTTOKENIZER_LIB_DIR}")
include_directories(${FASTTOKENIZER_INC_DIR})

# Resolve the per-platform library file name.
if(WIN32)
  set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/core_tokenizers.lib"
      CACHE FILEPATH "fast_tokenizer compile library." FORCE)
  # The Windows package additionally ships the ICU libraries it links against.
  set(ICUDT_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
  set(ICUUC_LIB "${FASTTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")
elseif(APPLE)
  set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
      CACHE FILEPATH "fast_tokenizer compile library." FORCE)
else()
  set(FASTTOKENIZER_COMPILE_LIB "${FASTTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
      CACHE FILEPATH "fast_tokenizer compile library." FORCE)
endif()
# Printed on every platform (the original skipped APPLE) with STATUS severity.
message(STATUS "FASTTOKENIZER_COMPILE_LIB = ${FASTTOKENIZER_COMPILE_LIB}")

# Pick the download archive matching the host platform and architecture.
set(FASTTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/fast_tokenizer/")
set(FASTTOKENIZER_VERSION "1.0.0")
if(WIN32)
  if(CMAKE_CL_64)
    set(FASTTOKENIZER_FILE "fast_tokenizer-win-x64-${FASTTOKENIZER_VERSION}.zip")
  else()
    set(FASTTOKENIZER_FILE "fast_tokenizer-win-x86-${FASTTOKENIZER_VERSION}.zip")
  endif()
elseif(APPLE)
  if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
    set(FASTTOKENIZER_FILE "fast_tokenizer-osx-arm64-${FASTTOKENIZER_VERSION}.tgz")
  else()
    set(FASTTOKENIZER_FILE "fast_tokenizer-osx-x86_64-${FASTTOKENIZER_VERSION}.tgz")
  endif()
else()
  if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
    set(FASTTOKENIZER_FILE "fast_tokenizer-linux-aarch64-${FASTTOKENIZER_VERSION}.tgz")
  else()
    set(FASTTOKENIZER_FILE "fast_tokenizer-linux-x64-${FASTTOKENIZER_VERSION}.tgz")
  endif()
endif()
set(FASTTOKENIZER_URL "${FASTTOKENIZER_URL_BASE}${FASTTOKENIZER_FILE}")

# The package is prebuilt: no configure/build step, "install" is a plain copy
# of the extracted archive into the install dir.
ExternalProject_Add(
  ${FASTTOKENIZER_PROJECT}
  ${EXTERNAL_PROJECT_LOG_ARGS}
  URL ${FASTTOKENIZER_URL}
  PREFIX ${FASTTOKENIZER_PREFIX_DIR}
  DOWNLOAD_NO_PROGRESS 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  UPDATE_COMMAND ""
  INSTALL_COMMAND
    ${CMAKE_COMMAND} -E copy_directory ${FASTTOKENIZER_SOURCE_DIR} ${FASTTOKENIZER_INSTALL_DIR}
  BUILD_BYPRODUCTS ${FASTTOKENIZER_COMPILE_LIB})

# core_tokenizers is a shared library (.so/.dylib, or an import .lib for the
# DLL on Windows), so declare the imported target as UNKNOWN rather than
# STATIC; CMake links it by full path either way.
add_library(fast_tokenizer UNKNOWN IMPORTED GLOBAL)
set_property(TARGET fast_tokenizer PROPERTY IMPORTED_LOCATION ${FASTTOKENIZER_COMPILE_LIB})
# add_dependencies only orders the download before consumers; linking happens
# via DEPEND_LIBS in the including scope.
add_dependencies(fast_tokenizer ${FASTTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS fast_tokenizer)

if(WIN32)
  add_library(icudt UNKNOWN IMPORTED GLOBAL)
  set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB})
  add_dependencies(icudt ${FASTTOKENIZER_PROJECT})
  list(APPEND DEPEND_LIBS icudt)
  add_library(icuuc UNKNOWN IMPORTED GLOBAL)
  set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB})
  add_dependencies(icuuc ${FASTTOKENIZER_PROJECT})
  list(APPEND DEPEND_LIBS icuuc)
endif()

View File

@@ -1,108 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(FASTERTOKENIZER_PROJECT "extern_faster_tokenizer")
set(FASTERTOKENIZER_PREFIX_DIR ${THIRD_PARTY_PATH}/faster_tokenizer)
set(FASTERTOKENIZER_SOURCE_DIR
${THIRD_PARTY_PATH}/faster_tokenizer/src/${FASTERTOKENIZER_PROJECT})
set(FASTERTOKENIZER_INSTALL_DIR ${THIRD_PARTY_PATH}/install/faster_tokenizer)
set(FASTERTOKENIZER_INC_DIR
"${FASTERTOKENIZER_INSTALL_DIR}/include"
"${FASTERTOKENIZER_INSTALL_DIR}/third_party/include"
CACHE PATH "faster_tokenizer include directory." FORCE)
set(FASTERTOKENIZER_LIB_DIR
"${FASTERTOKENIZER_INSTALL_DIR}/lib/"
CACHE PATH "faster_tokenizer lib directory." FORCE)
set(FASTERTOKENIZER_THIRD_LIB_DIR
"${FASTERTOKENIZER_INSTALL_DIR}/third_party/lib/"
CACHE PATH "faster_tokenizer lib directory." FORCE)
set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}"
"${FASTERTOKENIZER_LIB_DIR}")
include_directories(${FASTERTOKENIZER_INC_DIR})
# Set lib path
if(WIN32)
set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/core_tokenizers.lib"
CACHE FILEPATH "faster_tokenizer compile library." FORCE)
message("FASTERTOKENIZER_COMPILE_LIB = ${FASTERTOKENIZER_COMPILE_LIB}")
set(ICUDT_LIB "${FASTERTOKENIZER_THIRD_LIB_DIR}/icudt.lib")
set(ICUUC_LIB "${FASTERTOKENIZER_THIRD_LIB_DIR}/icuuc.lib")
elseif(APPLE)
set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/libcore_tokenizers.dylib"
CACHE FILEPATH "faster_tokenizer compile library." FORCE)
else()
set(FASTERTOKENIZER_COMPILE_LIB "${FASTERTOKENIZER_LIB_DIR}/libcore_tokenizers.so"
CACHE FILEPATH "faster_tokenizer compile library." FORCE)
message("FASTERTOKENIZER_COMPILE_LIB = ${FASTERTOKENIZER_COMPILE_LIB}")
endif(WIN32)
set(FASTERTOKENIZER_URL_BASE "https://bj.bcebos.com/paddlenlp/faster_tokenizer/")
set(FASTERTOKENIZER_VERSION "dev")
# Set download url
if(WIN32)
set(FASTERTOKENIZER_FILE "faster_tokenizer-win-x64-${FASTERTOKENIZER_VERSION}.zip")
if(NOT CMAKE_CL_64)
set(FASTERTOKENIZER_FILE "faster_tokenizer-win-x86-${FASTERTOKENIZER_VERSION}.zip")
endif()
elseif(APPLE)
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
set(FASTERTOKENIZER_FILE "faster_tokenizer-osx-arm64-${FASTERTOKENIZER_VERSION}.tgz")
else()
set(FASTERTOKENIZER_FILE "faster_tokenizer-osx-x86_64-${FASTERTOKENIZER_VERSION}.tgz")
endif()
else()
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(FASTERTOKENIZER_FILE "faster_tokenizer-linux-aarch64-${FASTERTOKENIZER_VERSION}.tgz")
else()
set(FASTERTOKENIZER_FILE "faster_tokenizer-linux-x64-${FASTERTOKENIZER_VERSION}.tgz")
endif()
endif()
set(FASTERTOKENIZER_URL "${FASTERTOKENIZER_URL_BASE}${FASTERTOKENIZER_FILE}")
ExternalProject_Add(
${FASTERTOKENIZER_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${FASTERTOKENIZER_URL}
PREFIX ${FASTERTOKENIZER_PREFIX_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND
${CMAKE_COMMAND} -E copy_directory ${FASTERTOKENIZER_SOURCE_DIR} ${FASTERTOKENIZER_INSTALL_DIR}
BUILD_BYPRODUCTS ${FASTERTOKENIZER_COMPILE_LIB})
add_library(faster_tokenizer STATIC IMPORTED GLOBAL)
set_property(TARGET faster_tokenizer PROPERTY IMPORTED_LOCATION ${FASTERTOKENIZER_COMPILE_LIB})
add_dependencies(faster_tokenizer ${FASTERTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS faster_tokenizer)
if (WIN32)
add_library(icudt STATIC IMPORTED GLOBAL)
set_property(TARGET icudt PROPERTY IMPORTED_LOCATION ${ICUDT_LIB})
add_dependencies(icudt ${FASTERTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS icudt)
add_library(icuuc STATIC IMPORTED GLOBAL)
set_property(TARGET icuuc PROPERTY IMPORTED_LOCATION ${ICUUC_LIB})
add_dependencies(icuuc ${FASTERTOKENIZER_PROJECT})
list(APPEND DEPEND_LIBS icuuc)
endif()

View File

@@ -18,11 +18,11 @@
#include "fastdeploy/function/softmax.h"
#include "fastdeploy/runtime.h"
#include "fastdeploy/utils/path.h"
#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h"
#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
#include "gflags/gflags.h"
using namespace paddlenlp;
using namespace faster_tokenizer::tokenizers_impl;
using namespace fast_tokenizer::tokenizers_impl;
#ifdef WIN32
const char sep = '\\';
#else
@@ -124,10 +124,10 @@ struct SeqClsResult {
struct ErnieForSequenceClassificationPredictor {
fastdeploy::Runtime runtime_;
ErnieFasterTokenizer tokenizer_;
ErnieFastTokenizer tokenizer_;
ErnieForSequenceClassificationPredictor(
const fastdeploy::RuntimeOption& option,
const ErnieFasterTokenizer& tokenizer)
const ErnieFastTokenizer& tokenizer)
: tokenizer_(tokenizer) {
runtime_.Init(option);
}
@@ -135,8 +135,8 @@ struct ErnieForSequenceClassificationPredictor {
bool Preprocess(const std::vector<std::string>& texts,
const std::vector<std::string>& texts_pair,
std::vector<fastdeploy::FDTensor>* inputs) {
std::vector<faster_tokenizer::core::Encoding> encodings;
std::vector<faster_tokenizer::core::EncodeInput> text_pair_input;
std::vector<fast_tokenizer::core::Encoding> encodings;
std::vector<fast_tokenizer::core::EncodeInput> text_pair_input;
// 1. Tokenize the text or (text, text_pair)
if (texts_pair.empty()) {
for (int i = 0; i < texts.size(); ++i) {
@@ -242,7 +242,7 @@ int main(int argc, char* argv[]) {
return -1;
}
}
ErnieFasterTokenizer tokenizer(vocab_path);
ErnieFastTokenizer tokenizer(vocab_path);
ErnieForSequenceClassificationPredictor predictor(option, tokenizer);

View File

@@ -1,2 +1,2 @@
faster_tokenizer
fast-tokenizer-python
paddlenlp

View File

@@ -15,7 +15,7 @@ import os
import distutils.util
import numpy as np
import faster_tokenizer
import fast_tokenizer
from paddlenlp.transformers import AutoTokenizer
import fastdeploy as fd

View File

@@ -19,8 +19,8 @@
#include <queue>
#include <sstream>
#include "faster_tokenizer/pretokenizers/pretokenizer.h"
#include "faster_tokenizer/utils/utf8.h"
#include "fast_tokenizer/pretokenizers/pretokenizer.h"
#include "fast_tokenizer/utils/utf8.h"
namespace fastdeploy {
namespace text {
@@ -30,9 +30,9 @@ static std::string DBC2SBC(const std::string& content) {
size_t content_utf8_len = 0;
while (content_utf8_len < content.length()) {
uint32_t content_char;
auto content_char_width = faster_tokenizer::utils::UTF8ToUInt32(
auto content_char_width = fast_tokenizer::utils::UTF8ToUInt32(
content.data() + content_utf8_len, &content_char);
content_char = faster_tokenizer::utils::UTF8ToUnicode(content_char);
content_char = fast_tokenizer::utils::UTF8ToUnicode(content_char);
if (content_char == 0x3000) {
content_char = 0x0020;
} else {
@@ -43,9 +43,9 @@ static std::string DBC2SBC(const std::string& content) {
} else {
char dst_char[5] = {0};
uint32_t utf8_uint32 =
faster_tokenizer::utils::UnicodeToUTF8(content_char);
fast_tokenizer::utils::UnicodeToUTF8(content_char);
uint32_t utf8_char_count =
faster_tokenizer::utils::UnicodeToUTF8Char(utf8_uint32, dst_char);
fast_tokenizer::utils::UnicodeToUTF8Char(utf8_uint32, dst_char);
result.append(dst_char, utf8_char_count);
}
content_utf8_len += content_char_width;
@@ -177,8 +177,8 @@ UIEModel::UIEModel(const std::string& model_file,
initialized = Initialize();
SetSchema(schema);
tokenizer_.EnableTruncMethod(
max_length, 0, faster_tokenizer::core::Direction::RIGHT,
faster_tokenizer::core::TruncStrategy::LONGEST_FIRST);
max_length, 0, fast_tokenizer::core::Direction::RIGHT,
fast_tokenizer::core::TruncStrategy::LONGEST_FIRST);
}
UIEModel::UIEModel(const std::string& model_file,
@@ -198,8 +198,8 @@ UIEModel::UIEModel(const std::string& model_file,
initialized = Initialize();
SetSchema(schema);
tokenizer_.EnableTruncMethod(
max_length, 0, faster_tokenizer::core::Direction::RIGHT,
faster_tokenizer::core::TruncStrategy::LONGEST_FIRST);
max_length, 0, fast_tokenizer::core::Direction::RIGHT,
fast_tokenizer::core::TruncStrategy::LONGEST_FIRST);
}
UIEModel::UIEModel(const std::string& model_file,
@@ -219,8 +219,8 @@ UIEModel::UIEModel(const std::string& model_file,
initialized = Initialize();
SetSchema(schema);
tokenizer_.EnableTruncMethod(
max_length, 0, faster_tokenizer::core::Direction::RIGHT,
faster_tokenizer::core::TruncStrategy::LONGEST_FIRST);
max_length, 0, fast_tokenizer::core::Direction::RIGHT,
fast_tokenizer::core::TruncStrategy::LONGEST_FIRST);
}
bool UIEModel::Initialize() {
@@ -253,7 +253,7 @@ void UIEModel::AutoSplitter(const std::vector<std::string>& texts,
size_t cnt_org = 0;
size_t cnt_short = 0;
for (auto& text : texts) {
auto text_len = faster_tokenizer::utils::GetUnicodeLenFromUTF8(
auto text_len = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
text.c_str(), text.length());
if (text_len <= max_length) {
short_texts->push_back(text);
@@ -264,14 +264,14 @@ void UIEModel::AutoSplitter(const std::vector<std::string>& texts,
}
cnt_short += 1;
} else {
faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
text);
for (size_t start = 0; start < text_len; start += max_length) {
size_t end = start + max_length;
if (end > text_len) {
end = text_len;
}
faster_tokenizer::core::Offset byte_offset;
fast_tokenizer::core::Offset byte_offset;
converter.convert({start, end}, &byte_offset);
short_texts->emplace_back(text.data() + byte_offset.first,
byte_offset.second - byte_offset.first);
@@ -344,12 +344,12 @@ void UIEModel::GetSpan(const std::vector<IDX_PROB>& start_idx_prob,
}
void UIEModel::GetSpanIdxAndProbs(
const SPAN_SET& span_set,
const std::vector<faster_tokenizer::core::Offset>& offset_mapping,
const std::vector<fast_tokenizer::core::Offset>& offset_mapping,
std::vector<SpanIdx>* span_idxs, std::vector<float>* probs) const {
auto first_sep_idx =
std::find_if(offset_mapping.begin() + 1, offset_mapping.end(),
[](const faster_tokenizer::core::Offset& offset) {
return offset == faster_tokenizer::core::Offset(0, 0);
[](const fast_tokenizer::core::Offset& offset) {
return offset == fast_tokenizer::core::Offset(0, 0);
});
auto prompt_end_token_id =
std::distance(offset_mapping.begin(), first_sep_idx) - 1;
@@ -384,9 +384,9 @@ void UIEModel::ConvertSpanToUIEResult(
std::string span_text;
std::vector<uint32_t> offset_mapping;
if (span_idxs[i][j].is_prompt_) {
faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
prompt);
faster_tokenizer::core::Offset byte_offset;
fast_tokenizer::core::Offset byte_offset;
converter.convert({start, end}, &byte_offset);
span_text = prompt.substr(byte_offset.first,
byte_offset.second - byte_offset.first);
@@ -394,9 +394,9 @@ void UIEModel::ConvertSpanToUIEResult(
start = 0;
end = 0;
} else {
faster_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
fast_tokenizer::pretokenizers::CharToBytesOffsetConverter converter(
text);
faster_tokenizer::core::Offset byte_offset;
fast_tokenizer::core::Offset byte_offset;
converter.convert({start, end}, &byte_offset);
span_text = text.substr(byte_offset.first,
byte_offset.second - byte_offset.first);
@@ -461,14 +461,14 @@ void UIEModel::AutoJoiner(const std::vector<std::string>& short_texts,
for (auto&& result_idx : input_mapping_item) {
if (result_idx == 0) {
result_list = std::move((*results)[result_idx]);
offset += faster_tokenizer::utils::GetUnicodeLenFromUTF8(
offset += fast_tokenizer::utils::GetUnicodeLenFromUTF8(
short_texts[result_idx].c_str(), short_texts[result_idx].size());
} else {
for (auto&& curr_result : (*results)[result_idx]) {
curr_result.start_ += offset;
curr_result.end_ += offset;
}
offset += faster_tokenizer::utils::GetUnicodeLenFromUTF8(
offset += fast_tokenizer::utils::GetUnicodeLenFromUTF8(
short_texts[result_idx].c_str(), short_texts[result_idx].size());
result_list.insert(result_list.end(), (*results)[result_idx].begin(),
(*results)[result_idx].end());
@@ -521,13 +521,13 @@ bool UIEModel::ConstructTextsAndPrompts(
auto max_prompt_iter = std::max_element(
prompts->begin(), prompts->end(),
[](const std::string& lhs, const std::string& rhs) {
auto lhs_ulen = faster_tokenizer::utils::GetUnicodeLenFromUTF8(
auto lhs_ulen = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
lhs.c_str(), lhs.length());
auto rhs_ulen = faster_tokenizer::utils::GetUnicodeLenFromUTF8(
auto rhs_ulen = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
rhs.c_str(), rhs.length());
return lhs_ulen < rhs_ulen;
});
auto max_prompt_len = faster_tokenizer::utils::GetUnicodeLenFromUTF8(
auto max_prompt_len = fast_tokenizer::utils::GetUnicodeLenFromUTF8(
max_prompt_iter->c_str(), max_prompt_iter->length());
auto max_predict_len = max_length_ - 3 - max_prompt_len;
@@ -547,10 +547,10 @@ bool UIEModel::ConstructTextsAndPrompts(
void UIEModel::Preprocess(
const std::vector<std::string>& input_texts,
const std::vector<std::string>& prompts,
std::vector<faster_tokenizer::core::Encoding>* encodings,
std::vector<fast_tokenizer::core::Encoding>* encodings,
std::vector<fastdeploy::FDTensor>* inputs) {
// 1. Tokenize the short texts and short prompts
std::vector<faster_tokenizer::core::EncodeInput> text_pair_input;
std::vector<fast_tokenizer::core::EncodeInput> text_pair_input;
for (int i = 0; i < input_texts.size(); ++i) {
text_pair_input.emplace_back(
std::pair<std::string, std::string>(prompts[i], input_texts[i]));
@@ -596,7 +596,7 @@ void UIEModel::Preprocess(
void UIEModel::Postprocess(
const std::vector<fastdeploy::FDTensor>& outputs,
const std::vector<faster_tokenizer::core::Encoding>& encodings,
const std::vector<fast_tokenizer::core::Encoding>& encodings,
const std::vector<std::string>& short_input_texts,
const std::vector<std::string>& short_prompts,
const std::vector<std::vector<size_t>>& input_mapping_with_short_text,
@@ -611,7 +611,7 @@ void UIEModel::Postprocess(
GetCandidateIdx(end_prob, outputs[1].shape[0], outputs[1].shape[1],
&end_candidate_idx_prob, position_prob_);
std::vector<std::vector<faster_tokenizer::core::Offset>> offset_mapping;
std::vector<std::vector<fast_tokenizer::core::Offset>> offset_mapping;
for (int i = 0; i < encodings.size(); ++i) {
auto&& curr_offsets = encodings[i].GetOffsets();
offset_mapping.push_back(curr_offsets);
@@ -739,7 +739,7 @@ void UIEModel::Predict(
if (has_prompt) {
// 2. Convert texts and prompts to FDTensor
std::vector<FDTensor> inputs;
std::vector<faster_tokenizer::core::Encoding> encodings;
std::vector<fast_tokenizer::core::Encoding> encodings;
Preprocess(short_input_texts, short_prompts, &encodings, &inputs);
// 3. Infer

View File

@@ -21,7 +21,7 @@
#include <vector>
#include "fastdeploy/fastdeploy_model.h"
#include "fastdeploy/utils/unique_ptr.h"
#include "faster_tokenizer/tokenizers/ernie_faster_tokenizer.h"
#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
using namespace paddlenlp;
@@ -133,11 +133,11 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
std::vector<std::vector<size_t>>* input_mapping_with_short_text);
void Preprocess(const std::vector<std::string>& input_texts,
const std::vector<std::string>& prompts,
std::vector<faster_tokenizer::core::Encoding>* encodings,
std::vector<fast_tokenizer::core::Encoding>* encodings,
std::vector<fastdeploy::FDTensor>* inputs);
void Postprocess(
const std::vector<fastdeploy::FDTensor>& outputs,
const std::vector<faster_tokenizer::core::Encoding>& encodings,
const std::vector<fast_tokenizer::core::Encoding>& encodings,
const std::vector<std::string>& short_input_texts,
const std::vector<std::string>& short_prompts,
const std::vector<std::vector<size_t>>& input_mapping_with_short_text,
@@ -167,7 +167,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
};
using SPAN_SET = std::set<std::pair<IDX_PROB, IDX_PROB>, IdxProbCmp>;
struct SpanIdx {
faster_tokenizer::core::Offset offset_;
fast_tokenizer::core::Offset offset_;
bool is_prompt_;
};
void SetValidBackend();
@@ -188,7 +188,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
SPAN_SET* span_set) const;
void GetSpanIdxAndProbs(
const SPAN_SET& span_set,
const std::vector<faster_tokenizer::core::Offset>& offset_mapping,
const std::vector<fast_tokenizer::core::Offset>& offset_mapping,
std::vector<SpanIdx>* span_idxs, std::vector<float>* probs) const;
void ConvertSpanToUIEResult(
const std::vector<std::string>& texts,
@@ -200,7 +200,7 @@ struct FASTDEPLOY_DECL UIEModel : public FastDeployModel {
size_t max_length_;
float position_prob_;
SchemaLanguage schema_language_;
faster_tokenizer::tokenizers_impl::ErnieFasterTokenizer tokenizer_;
fast_tokenizer::tokenizers_impl::ErnieFastTokenizer tokenizer_;
};
} // namespace text

View File

@@ -46,8 +46,8 @@ if "%__script_action_type%" == "show" (
echo !__3rd_lib_file! | findstr "opencv">nul && set __3rd_needed_flag=true
echo !__3rd_lib_file! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision
if "!__3rd_needed_flag!"=="true" (echo !__3rd_lib_file! | findstr d\.lib>nul && set __3rd_needed_flag=false)
echo !__3rd_lib_file! | findstr "faster_tokenizer">nul && set __3rd_needed_flag=true
echo !__3rd_lib_file! | findstr "faster_tokenizer">nul && set __api_tag=!__api_tag!::text
echo !__3rd_lib_file! | findstr "fast_tokenizer">nul && set __3rd_needed_flag=true
echo !__3rd_lib_file! | findstr "fast_tokenizer">nul && set __api_tag=!__api_tag!::text
if "!__3rd_needed_flag!"=="true" (echo [Lib] !__3rd_lib_file! **[NEEDED][!__api_tag!]**) else (echo [Lib] !__3rd_lib_file!)
)
@@ -58,8 +58,8 @@ if "%__script_action_type%" == "show" (
set __3rd_include_dir=%%a && set __3rd_needed_flag=false && set __api_tag=fastdeploy
echo !__3rd_include_dir! | findstr "opencv">nul && set __3rd_needed_flag=true
echo !__3rd_include_dir! | findstr "opencv">nul && set __api_tag=!__api_tag!::vision
echo !__3rd_include_dir! | findstr "faster_tokenizer">nul && set __3rd_needed_flag=true
echo !__3rd_include_dir! | findstr "faster_tokenizer">nul && set __api_tag=!__api_tag!::text
echo !__3rd_include_dir! | findstr "fast_tokenizer">nul && set __3rd_needed_flag=true
echo !__3rd_include_dir! | findstr "fast_tokenizer">nul && set __api_tag=!__api_tag!::text
if "!__3rd_needed_flag!"=="true" (echo [Include] !__3rd_include_dir! **[NEEDED][!__api_tag!]**) else (echo [Include] !__3rd_include_dir!)
)

View File

@@ -60,9 +60,4 @@ if [ -d ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite ]; th
echo "Paddle Lite Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/paddlelite/lib"
fi
if [ -d ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer ]; then
export LD_LIBRARY_PATH=${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer/lib:${LD_LIBRARY_PATH}
echo "Faster Tokenizer Lib: ${INSTALLED_PREBUILT_FASTDEPLOY_DIR}/third_libs/install/faster_tokenizer/lib"
fi
cd ${CURRENT_EXE_DIR}

View File

@@ -35,7 +35,7 @@ RUN apt-get update \
RUN apt-get update \
&& apt-get install -y --no-install-recommends libre2-5 libb64-0d python3 python3-pip libarchive-dev ffmpeg libsm6 libxext6 \
&& python3 -m pip install -U pip \
&& python3 -m pip install paddlepaddle-gpu paddlenlp faster_tokenizer
&& python3 -m pip install paddlepaddle-gpu paddlenlp fast-tokenizer-python
COPY python/dist/*.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \

View File

@@ -19,7 +19,7 @@ ENV TZ=Asia/Shanghai \
RUN apt-get update && apt-get install -y --no-install-recommends apt-utils libgomp1 ffmpeg libsm6 libxext6 \
&& python3 -m pip install -U pip \
&& python3 -m pip install paddlepaddle paddlenlp faster_tokenizer
&& python3 -m pip install paddlepaddle paddlenlp fast-tokenizer-python
COPY python/dist/*.whl *.whl /opt/fastdeploy/
RUN python3 -m pip install /opt/fastdeploy/*.whl \

View File

@@ -58,8 +58,8 @@ set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\insta
set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle_inference\third_party\install\mklml\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\paddle2onnx\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\tensorrt\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\faster_tokenizer\third_party\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\fast_tokenizer\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\fast_tokenizer\third_party\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\yaml-cpp\lib;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\opencv\build\x64\vc15\bin;%PATH%
set PATH=%FASTDEPLOY_HOME%\third_libs\install\openvino\runtime\bin;%PATH%