mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 00:57:33 +08:00
Add uie python example and doc (#221)
* add fastdeploy.text.UIEModel * Add uie python example * Add one schema for cpp demo * Add ConvertUIEResultToDict for pretty the uie result in python * remove default args for SchemaNode * Add uie example args * Add uie python api desc * Add infer.py usage * truncate some example output * Add uie schema usage * Add uie result md * Add uie c++ api doc
This commit is contained in:
@@ -262,11 +262,6 @@ add_library(${LIBRARY_NAME} SHARED ${ALL_DEPLOY_SRCS})
|
||||
add_dependencies(${LIBRARY_NAME} extern_eigen3)
|
||||
|
||||
redefine_file_macro(${LIBRARY_NAME})
|
||||
set_target_properties(${LIBRARY_NAME} PROPERTIES COMPILE_FLAGS "-fvisibility=hidden")
|
||||
if(NOT APPLE)
|
||||
set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS "-Wl,--start-group,--exclude-libs,ALL")
|
||||
endif()
|
||||
set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS_RELEASE -s)
|
||||
|
||||
file(READ "${PROJECT_SOURCE_DIR}/VERSION_NUMBER" FASTDEPLOY_VERSION)
|
||||
string(STRIP "${FASTDEPLOY_VERSION}" FASTDEPLOY_VERSION)
|
||||
|
7
docs/api/text_results/README.md
Normal file
7
docs/api/text_results/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# 自然语言模型预测结果说明
|
||||
|
||||
FastDeploy根据自然语言模型的任务类型,定义了不同的结构体来表达模型预测结果,具体如下表所示
|
||||
|
||||
| 结构体 | 文档 | 说明 | 相应模型 |
|
||||
| :----- | :--- | :---- | :------- |
|
||||
| UIEResult | [C++/Python文档](./uie_result.md) | UIE模型返回结果 | UIE模型 |
|
34
docs/api/text_results/uie_result.md
Normal file
34
docs/api/text_results/uie_result.md
Normal file
@@ -0,0 +1,34 @@
|
||||
# UIEResult 信息抽取结果
|
||||
|
||||
UIEResult代码定义在`fastdeploy/text/uie/model.h`中,用于表明UIE模型抽取结果和置信度。
|
||||
|
||||
## C++ 定义
|
||||
|
||||
`fastdeploy::text::UIEResult`
|
||||
|
||||
```c++
|
||||
struct UIEResult {
|
||||
size_t start_;
|
||||
size_t end_;
|
||||
double probability_;
|
||||
std::string text_;
|
||||
std::unordered_map<std::string, std::vector<UIEResult>> relation_;
|
||||
std::string Str() const;
|
||||
};
|
||||
```
|
||||
|
||||
- **start_**: 成员变量,表示抽取结果text_在原文本(Unicode编码)中的起始位置。
|
||||
- **end_**: 成员变量,表示抽取结果text_在原文本(Unicode编码)中的结束位置。
- **probability_**: 成员变量,表示抽取结果的置信度。
|
||||
- **text_**: 成员变量,表示抽取的结果,以UTF-8编码方式保存。
|
||||
- **relation_**: 成员变量,表示当前结果关联的结果。常用于关系抽取。
|
||||
- **Str()**: 成员函数,将结构体中的信息以字符串形式输出(用于Debug)
|
||||
|
||||
## Python 定义
|
||||
|
||||
`fastdeploy.text.C.UIEResult`
|
||||
|
||||
- **start**(int): 成员变量,表示抽取结果text在原文本(Unicode编码)中的起始位置。
|
||||
- **end**(int): 成员变量,表示抽取结果text在原文本(Unicode编码)中的结束位置。
- **probability**(float): 成员变量,表示抽取结果的置信度。
|
||||
- **text**(str): 成员变量,表示抽取的结果,以UTF-8编码方式保存。
|
||||
- **relation**(dict(str, list(fastdeploy.text.C.UIEResult))): 成员变量,表示当前结果关联的结果。常用于关系抽取。
|
||||
- **get_dict()**: 以dict形式返回fastdeploy.text.C.UIEResult。
|
@@ -7,28 +7,18 @@
|
||||
- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../docs/environment.md)
|
||||
- 2. 根据开发环境,下载预编译部署库和samples代码,参考[FastDeploy预编译库](../../../../docs/compile/prebuilt_libraries.md)
|
||||
|
||||
## 快速开始
|
||||
以Linux上uie-base模型推理为例,在本目录执行如下命令即可完成编译测试。
|
||||
|
||||
```
|
||||
# UIE目前还未发布,当前需开发者自行编译FastDeploy,通过如下脚本编译得到部署库fastdeploy-linux-x64-dev
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
cmake .. -DENABLE_ORT_BACKEND=ON \
|
||||
-DENABLE_VISION=ON \
|
||||
-DENABLE_PADDLE_BACKEND=ON \
|
||||
-DENABLE_TEXT=ON \
|
||||
-DWITH_GPU=ON \
|
||||
-DCMAKE_INSTALL_PREFIX=${PWD}/fastdeploy-linux-x64-gpu-dev
|
||||
#下载SDK,编译模型examples代码(SDK中包含了examples代码)
|
||||
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-gpu-0.2.1.tgz
|
||||
tar xvf fastdeploy-linux-x64-gpu-0.2.1.tgz
|
||||
|
||||
make -j8
|
||||
make install
|
||||
|
||||
# 编译模型examples代码(SDK中包含了examples代码)
|
||||
cd ../examples/text/uie/cpp
|
||||
cd fastdeploy-linux-x64-gpu-0.2.1/examples/text/uie/cpp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/../../../../../build/fastdeploy-linux-x64-gpu-dev
|
||||
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/../../../../../../fastdeploy-linux-x64-gpu-0.2.1
|
||||
make -j
|
||||
|
||||
# 下载uie-base模型以及词表
|
||||
@@ -41,7 +31,116 @@ tar -xvfz uie-base.tgz
|
||||
|
||||
# GPU 推理
|
||||
./infer_demo uie-base 1
|
||||
|
||||
# 使用OpenVINO推理
|
||||
./infer_demo uie-base 1 2
|
||||
```
|
||||
|
||||
## 模型获取
|
||||
UIE 模型介绍可以参考https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/uie 。其中,在完成训练后,需要将训练后的模型导出成推理模型。该步骤可参考该文档完成导出:https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/uie#%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2 。
|
||||
运行完成后返回结果如下所示(仅截取NER任务的输出)。
|
||||
```bash
|
||||
[INFO] fastdeploy/fastdeploy_runtime.cc(264)::Init Runtime initialized with Backend::PDINFER in device Device::CPU.
|
||||
After init predictor
|
||||
The result:
|
||||
赛事名称:
|
||||
text: 北京冬奥会自由式滑雪女子大跳台决赛
|
||||
probability: 0.850309
|
||||
start: 6
|
||||
end: 23
|
||||
|
||||
时间:
|
||||
text: 2月8日上午
|
||||
probability: 0.985738
|
||||
start: 0
|
||||
end: 6
|
||||
|
||||
选手:
|
||||
text: 谷爱凌
|
||||
probability: 0.898155
|
||||
start: 28
|
||||
end: 31
|
||||
```
|
||||
|
||||
## UIEModel C++接口
|
||||
|
||||
### SchemaNode 结构
|
||||
表示UIE模型目标模式的结构。
|
||||
|
||||
```c++
|
||||
SchemaNode(const std::string& name,
|
||||
const std::vector<SchemaNode>& children = {});
|
||||
```
|
||||
**参数**
|
||||
|
||||
> * **name**(str): 需要抽取的信息。
|
||||
> * **children**(list(SchemaNode)): 当前节点需抽取信息关联的子信息。
|
||||
|
||||
### UIEModel 结构
|
||||
用于信息抽取任务的UIE模型结构。
|
||||
|
||||
#### 初始化函数
|
||||
```c++
|
||||
UIEModel(
|
||||
const std::string& model_file, const std::string& params_file,
|
||||
const std::string& vocab_file, float position_prob, size_t max_length,
|
||||
const std::vector<std::string>& schema,
|
||||
const fastdeploy::RuntimeOption& custom_option =
|
||||
fastdeploy::RuntimeOption(),
|
||||
const fastdeploy::Frontend& model_format = fastdeploy::Frontend::PADDLE);
|
||||
UIEModel(
|
||||
const std::string& model_file, const std::string& params_file,
|
||||
const std::string& vocab_file, float position_prob, size_t max_length,
|
||||
const SchemaNode& schema, const fastdeploy::RuntimeOption& custom_option =
|
||||
fastdeploy::RuntimeOption(),
|
||||
const fastdeploy::Frontend& model_format = fastdeploy::Frontend::PADDLE);
|
||||
UIEModel(
|
||||
const std::string& model_file, const std::string& params_file,
|
||||
const std::string& vocab_file, float position_prob, size_t max_length,
|
||||
const std::vector<SchemaNode>& schema,
|
||||
const fastdeploy::RuntimeOption& custom_option =
|
||||
fastdeploy::RuntimeOption(),
|
||||
const fastdeploy::Frontend& model_format = fastdeploy::Frontend::PADDLE);
|
||||
```
|
||||
|
||||
UIE模型加载和初始化,其中model_file, params_file为训练模型导出的Paddle inference文件,具体请参考其文档说明[模型导出](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/uie/README.md#%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2)。
|
||||
|
||||
**参数**
|
||||
|
||||
> * **model_file**(str): 模型文件路径
|
||||
> * **params_file**(str): 参数文件路径
|
||||
> * **vocab_file**(str): 词表文件路径
|
||||
> * **position_prob**(float): 位置概率,模型将输出位置概率大于`position_prob`的位置,默认为0.5
|
||||
> * **max_length**(int): 输入文本的最大长度。输入文本下标超过`max_length`的部分将被截断。默认为128
|
||||
> * **schema**(list(SchemaNode) | SchemaNode | list(str)): 抽取任务的目标模式。
|
||||
> * **custom_option**(RuntimeOption): 后端推理配置,默认为`fastdeploy::RuntimeOption()`,即采用默认配置
|
||||
> * **model_format**(Frontend): 模型格式,默认为Paddle格式
|
||||
|
||||
#### SetSchema函数
|
||||
|
||||
```c++
|
||||
void SetSchema(const std::vector<std::string>& schema);
|
||||
void SetSchema(const std::vector<SchemaNode>& schema);
|
||||
void SetSchema(const SchemaNode& schema);
|
||||
```
|
||||
|
||||
**参数**
|
||||
> * **schema**(list(SchemaNode) | SchemaNode | list(str)): 输入数据,待抽取文本模式。
|
||||
|
||||
#### Predict函数
|
||||
|
||||
```c++
|
||||
void Predict(
|
||||
const std::vector<std::string>& texts,
|
||||
std::vector<std::unordered_map<std::string, std::vector<UIEResult>>>* results);
|
||||
```
|
||||
**参数**
|
||||
|
||||
> * **texts**(list(str)): 文本列表
|
||||
> * **results**(list(dict())): UIE模型抽取结果。UIEResult结构详细可见[UIEResult说明](../../../../docs/api/text_results/uie_result.md)。
|
||||
|
||||
## 相关文档
|
||||
|
||||
[UIE模型详细介绍](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/uie/README.md)
|
||||
|
||||
[UIE模型导出方法](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/uie/README.md#%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2)
|
||||
|
||||
[UIE Python部署方法](../python/README.md)
|
||||
|
@@ -71,7 +71,7 @@ int main(int argc, char* argv[]) {
|
||||
auto predictor =
|
||||
fastdeploy::text::UIEModel(model_path, param_path, vocab_path, 0.5, 128,
|
||||
{"时间", "选手", "赛事名称"}, option);
|
||||
fastdeploy::FDINFO << "After init predictor" << std::endl;
|
||||
std::cout << "After init predictor" << std::endl;
|
||||
std::vector<std::unordered_map<std::string, std::vector<UIEResult>>> results;
|
||||
// Named Entity Recognition
|
||||
predictor.Predict({"2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷"
|
||||
@@ -80,6 +80,16 @@ int main(int argc, char* argv[]) {
|
||||
std::cout << results << std::endl;
|
||||
results.clear();
|
||||
|
||||
predictor.SetSchema(
|
||||
{"肿瘤的大小", "肿瘤的个数", "肝癌级别", "脉管内癌栓分级"});
|
||||
predictor.Predict({"(右肝肿瘤)肝细胞性肝癌(II-"
|
||||
"III级,梁索型和假腺管型),肿瘤包膜不完整,紧邻肝被膜,侵"
|
||||
"及周围肝组织,未见脉管内癌栓(MVI分级:M0级)及卫星子灶形"
|
||||
"成。(肿物1个,大小4.2×4.0×2.8cm)。"},
|
||||
&results);
|
||||
std::cout << results << std::endl;
|
||||
results.clear();
|
||||
|
||||
// Relation Extraction
|
||||
predictor.SetSchema(
|
||||
{SchemaNode("竞赛名称", {SchemaNode("主办方"), SchemaNode("承办方"),
|
||||
|
383
examples/text/uie/python/README.md
Normal file
383
examples/text/uie/python/README.md
Normal file
@@ -0,0 +1,383 @@
|
||||
# 通用信息抽取 UIE Python部署示例
|
||||
|
||||
在部署前,需确认以下两个步骤
|
||||
|
||||
- 1. 软硬件环境满足要求,参考[FastDeploy环境要求](../../../../docs/environment.md)
|
||||
- 2. FastDeploy Python whl包安装,参考[FastDeploy Python安装](../../../../docs/quick_start)
|
||||
|
||||
本目录下提供`infer.py`,可快速完成UIE模型在CPU/GPU上的部署,以及在CPU上通过OpenVINO加速的部署示例。执行如下脚本即可完成。
|
||||
|
||||
## 快速开始
|
||||
```bash
|
||||
|
||||
#下载部署示例代码
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy/examples/text/uie/python
|
||||
|
||||
# 下载UIE模型文件和词表,以uie-base模型为例
|
||||
wget https://bj.bcebos.com/fastdeploy/models/uie/uie-base.tgz
|
||||
tar -xvzf uie-base.tgz
|
||||
|
||||
# CPU推理
|
||||
python infer.py --model_dir uie-base --device cpu
|
||||
# GPU推理
|
||||
python infer.py --model_dir uie-base --device gpu
|
||||
# 使用OpenVINO推理
|
||||
python infer.py --model_dir uie-base --device cpu --backend openvino --cpu_num_threads 8
|
||||
```
|
||||
|
||||
运行完成后返回结果如下所示(仅截取NER任务的输出)。
|
||||
```bash
|
||||
1. Named Entity Recognition Task
|
||||
The extraction schema: ['时间', '选手', '赛事名称']
|
||||
[{'时间': {'end': 6,
|
||||
'probability': 0.9857379794120789,
|
||||
'start': 0,
|
||||
'text': '2月8日上午'},
|
||||
'赛事名称': {'end': 23,
|
||||
'probability': 0.8503087162971497,
|
||||
'start': 6,
|
||||
'text': '北京冬奥会自由式滑雪女子大跳台决赛'},
|
||||
'选手': {'end': 31,
|
||||
'probability': 0.8981553912162781,
|
||||
'start': 28,
|
||||
'text': '谷爱凌'}}]
|
||||
|
||||
The extraction schema: ['肿瘤的大小', '肿瘤的个数', '肝癌级别', '脉管内癌栓分级']
|
||||
[{'肝癌级别': {'end': 20,
|
||||
'probability': 0.9243271350860596,
|
||||
'start': 13,
|
||||
'text': 'II-III级'},
|
||||
'肿瘤的个数': {'end': 84,
|
||||
'probability': 0.7538408041000366,
|
||||
'start': 82,
|
||||
'text': '1个'},
|
||||
'肿瘤的大小': {'end': 100,
|
||||
'probability': 0.8341134190559387,
|
||||
'start': 87,
|
||||
'text': '4.2×4.0×2.8cm'},
|
||||
'脉管内癌栓分级': {'end': 70,
|
||||
'probability': 0.9083293080329895,
|
||||
'start': 67,
|
||||
'text': 'M0级'}}]
|
||||
......
|
||||
```
|
||||
|
||||
## UIE模型各抽取任务使用方式
|
||||
|
||||
在UIE模型中,schema代表要抽取的结构化信息,所以UIE模型可通过设置不同的schema支持不同信息抽取任务。
|
||||
|
||||
### 初始化UIEModel
|
||||
|
||||
```python
|
||||
import os
import fastdeploy
|
||||
from fastdeploy.text import UIEModel
|
||||
model_dir = "uie-base"
|
||||
model_path = os.path.join(model_dir, "inference.pdmodel")
|
||||
param_path = os.path.join(model_dir, "inference.pdiparams")
|
||||
vocab_path = os.path.join(model_dir, "vocab.txt")
|
||||
|
||||
runtime_option = fastdeploy.RuntimeOption()
|
||||
schema = ["时间", "选手", "赛事名称"]
|
||||
|
||||
# 初始化UIE模型
|
||||
uie = UIEModel(
|
||||
model_path,
|
||||
param_path,
|
||||
vocab_path,
|
||||
position_prob=0.5,
|
||||
max_length=128,
|
||||
schema=schema,
|
||||
runtime_option=runtime_option)
|
||||
```
|
||||
|
||||
### 实体抽取
|
||||
|
||||
初始化阶段将schema设置为```["时间", "选手", "赛事名称"]```,可对输入的文本抽取时间、选手以及赛事名称三个信息。
|
||||
|
||||
```python
|
||||
>>> from pprint import pprint
|
||||
>>> results = uie.predict(
|
||||
["2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!"], return_dict=True)
|
||||
>>> pprint(results)
|
||||
|
||||
# 示例输出
|
||||
# [{'时间': {'end': 6,
|
||||
# 'probability': 0.9857379794120789,
|
||||
# 'start': 0,
|
||||
# 'text': '2月8日上午'},
|
||||
# '赛事名称': {'end': 23,
|
||||
# 'probability': 0.8503087162971497,
|
||||
# 'start': 6,
|
||||
# 'text': '北京冬奥会自由式滑雪女子大跳台决赛'},
|
||||
# '选手': {'end': 31,
|
||||
# 'probability': 0.8981553912162781,
|
||||
# 'start': 28,
|
||||
# 'text': '谷爱凌'}}]
|
||||
|
||||
```
|
||||
|
||||
例如抽取的目标实体类型是"肿瘤的大小"、"肿瘤的个数"、"肝癌级别"和"脉管内癌栓分级", 则可执行如下语句:
|
||||
|
||||
```python
|
||||
>>> schema = ["肿瘤的大小", "肿瘤的个数", "肝癌级别", "脉管内癌栓分级"]
|
||||
>>> uie.set_schema(schema)
|
||||
>>> results = uie.predict(
|
||||
[
|
||||
"(右肝肿瘤)肝细胞性肝癌(II-III级,梁索型和假腺管型),肿瘤包膜不完整,紧邻肝被膜,侵及周围肝组织,"
|
||||
"未见脉管内癌栓(MVI分级:M0级)及卫星子灶形成。(肿物1个,大小4.2×4.0×2.8cm)。"
|
||||
],
|
||||
return_dict=True)
|
||||
>>> pprint(results)
|
||||
|
||||
# 示例输出
|
||||
# [{'肝癌级别': {'end': 20,
|
||||
# 'probability': 0.9243271350860596,
|
||||
# 'start': 13,
|
||||
# 'text': 'II-III级'},
|
||||
# '肿瘤的个数': {'end': 84,
|
||||
# 'probability': 0.7538408041000366,
|
||||
# 'start': 82,
|
||||
# 'text': '1个'},
|
||||
# '肿瘤的大小': {'end': 100,
|
||||
# 'probability': 0.8341134190559387,
|
||||
# 'start': 87,
|
||||
# 'text': '4.2×4.0×2.8cm'},
|
||||
# '脉管内癌栓分级': {'end': 70,
|
||||
# 'probability': 0.9083293080329895,
|
||||
# 'start': 67,
|
||||
# 'text': 'M0级'}}]
|
||||
```
|
||||
|
||||
|
||||
### 关系抽取
|
||||
|
||||
关系抽取(Relation Extraction,简称RE),是指从文本中识别实体并抽取实体之间的语义关系,进而获取三元组信息,即<主体,谓语,客体>。
|
||||
|
||||
例如以"竞赛名称"作为抽取主体,抽取关系类型为"主办方"、"承办方"和"已举办次数", 则可执行如下语句:
|
||||
|
||||
```python
|
||||
>>> schema = {"竞赛名称": ["主办方", "承办方", "已举办次数"]}
|
||||
>>> uie.set_schema(schema)
|
||||
>>> results = uie.predict(
|
||||
[
|
||||
"2022语言与智能技术竞赛由中国中文信息学会和中国计算机学会联合主办,百度公司、中国中文信息学会评测工作"
|
||||
"委员会和中国计算机学会自然语言处理专委会承办,已连续举办4届,成为全球最热门的中文NLP赛事之一。"
|
||||
],
|
||||
return_dict=True)
|
||||
>>> pprint(results)
|
||||
|
||||
# 示例输出
|
||||
# [{'竞赛名称': {'end': 13,
|
||||
# 'probability': 0.7825401425361633,
|
||||
# 'relation': {'主办方': [{'end': 22,
|
||||
# 'probability': 0.8421716690063477,
|
||||
# 'start': 14,
|
||||
# 'text': '中国中文信息学会'},
|
||||
# {'end': 30,
|
||||
# 'probability': 0.7580805420875549,
|
||||
# 'start': 23,
|
||||
# 'text': '中国计算机学会'}],
|
||||
# '已举办次数': [{'end': 82,
|
||||
# 'probability': 0.4671304225921631,
|
||||
# 'start': 80,
|
||||
# 'text': '4届'}],
|
||||
# '承办方': [{'end': 39,
|
||||
# 'probability': 0.8292709589004517,
|
||||
# 'start': 35,
|
||||
# 'text': '百度公司'},
|
||||
# {'end': 55,
|
||||
# 'probability': 0.7000502943992615,
|
||||
# 'start': 40,
|
||||
# 'text': '中国中文信息学会评测工作委员会'},
|
||||
# {'end': 72,
|
||||
# 'probability': 0.6193484663963318,
|
||||
# 'start': 56,
|
||||
# 'text': '中国计算机学会自然语言处理专委会'}]},
|
||||
# 'start': 0,
|
||||
# 'text': '2022语言与智能技术竞赛'}}]
|
||||
```
|
||||
|
||||
### 事件抽取
|
||||
|
||||
事件抽取 (Event Extraction, 简称EE),是指从自然语言文本中抽取预定义的事件触发词(Trigger)和事件论元(Argument),组合为相应的事件结构化信息。
|
||||
|
||||
例如抽取的目标是"地震"事件的"地震强度"、"时间"、"震中位置"和"震源深度"这些信息,则可执行如下代码:
|
||||
|
||||
```python
|
||||
>>> schema = {"地震触发词": ["地震强度", "时间", "震中位置", "震源深度"]}
|
||||
>>> uie.set_schema(schema)
|
||||
>>> results = uie.predict(
|
||||
[
|
||||
"中国地震台网正式测定:5月16日06时08分在云南临沧市凤庆县(北纬24.34度,东经99.98度)发生3.5级地震,"
|
||||
"震源深度10千米。"
|
||||
],
|
||||
return_dict=True)
|
||||
>>> pprint(results)
|
||||
|
||||
# 示例输出
|
||||
# [{'地震触发词': {'end': 58,
|
||||
# 'probability': 0.9977425932884216,
|
||||
# 'relation': {'地震强度': [{'end': 56,
|
||||
# 'probability': 0.9980800747871399,
|
||||
# 'start': 52,
|
||||
# 'text': '3.5级'}],
|
||||
# '时间': [{'end': 22,
|
||||
# 'probability': 0.9853301644325256,
|
||||
# 'start': 11,
|
||||
# 'text': '5月16日06时08分'}],
|
||||
# '震中位置': [{'end': 50,
|
||||
# 'probability': 0.7874020934104919,
|
||||
# 'start': 23,
|
||||
# 'text': '云南临沧市凤庆县(北纬24.34度,东经99.98度)'}],
|
||||
# '震源深度': [{'end': 67,
|
||||
# 'probability': 0.9937973618507385,
|
||||
# 'start': 63,
|
||||
# 'text': '10千米'}]},
|
||||
# 'start': 56,
|
||||
# 'text': '地震'}}]
|
||||
```
|
||||
|
||||
### 评论观点抽取
|
||||
|
||||
评论观点抽取,是指抽取文本中包含的评价维度、观点词。
|
||||
|
||||
例如抽取的目标是文本中包含的评价维度及其对应的观点词和情感倾向,可执行以下代码:
|
||||
|
||||
```python
|
||||
>>> schema = {"评价维度": ["观点词", "情感倾向[正向,负向]"]}
|
||||
>>> uie.set_schema(schema)
|
||||
>>> results = uie.predict(
|
||||
["店面干净,很清静,服务员服务热情,性价比很高,发现收银台有排队"], return_dict=True)
|
||||
>>> pprint(results)
|
||||
|
||||
# 示例输出
|
||||
# [{'评价维度': {'end': 20,
|
||||
# 'probability': 0.9817039966583252,
|
||||
# 'relation': {'情感倾向[正向,负向]': [{'end': 0,
|
||||
# 'probability': 0.9966142177581787,
|
||||
# 'start': 0,
|
||||
# 'text': '正向'}],
|
||||
# '观点词': [{'end': 22,
|
||||
# 'probability': 0.9573966264724731,
|
||||
# 'start': 21,
|
||||
# 'text': '高'}]},
|
||||
# 'start': 17,
|
||||
# 'text': '性价比'}}]
|
||||
```
|
||||
|
||||
### 情感分类
|
||||
|
||||
句子级情感倾向分类,即判断句子的情感倾向是“正向”还是“负向”,可执行以下代码:
|
||||
|
||||
```python
|
||||
>>> schema = ["情感倾向[正向,负向]"]
|
||||
>>> uie.set_schema(schema)
|
||||
>>> results = uie.predict(["这个产品用起来真的很流畅,我非常喜欢"], return_dict=True)
|
||||
>>> pprint(results)
|
||||
|
||||
# 示例输出
|
||||
# [{'情感倾向[正向,负向]': {'end': 0,
|
||||
# 'probability': 0.9990023970603943,
|
||||
# 'start': 0,
|
||||
# 'text': '正向'}}]
|
||||
```
|
||||
|
||||
### 跨任务抽取
|
||||
|
||||
例如在法律场景同时对文本进行实体抽取和关系抽取,可执行以下代码:
|
||||
|
||||
```python
|
||||
>>> schema = ["法院", {"原告": "委托代理人"}, {"被告": "委托代理人"}]
|
||||
>>> uie.set_schema(schema)
|
||||
>>> results = uie.predict(
|
||||
[
|
||||
"北京市海淀区人民法院\n民事判决书\n(199x)建初字第xxx号\n原告:张三。\n委托代理人李四,北京市 A律师"
|
||||
"事务所律师。\n被告:B公司,法定代表人王五,开发公司总经理。\n委托代理人赵六,北京市 C律师事务所律师。"
|
||||
],
|
||||
return_dict=True)
|
||||
>>> pprint(results)
|
||||
# 示例输出
|
||||
# [{'原告': {'end': 37,
|
||||
# 'probability': 0.9949813485145569,
|
||||
# 'relation': {'委托代理人': [{'end': 46,
|
||||
# 'probability': 0.7956855297088623,
|
||||
# 'start': 44,
|
||||
# 'text': '李四'}]},
|
||||
# 'start': 35,
|
||||
# 'text': '张三'},
|
||||
# '法院': {'end': 10,
|
||||
# 'probability': 0.9221072793006897,
|
||||
# 'start': 0,
|
||||
# 'text': '北京市海淀区人民法院'},
|
||||
# '被告': {'end': 67,
|
||||
# 'probability': 0.8437348008155823,
|
||||
# 'relation': {'委托代理人': [{'end': 92,
|
||||
# 'probability': 0.7267124652862549,
|
||||
# 'start': 90,
|
||||
# 'text': '赵六'}]},
|
||||
# 'start': 64,
|
||||
# 'text': 'B公司'}}]
|
||||
```
|
||||
|
||||
## UIEModel Python接口
|
||||
|
||||
```python
|
||||
fd.text.uie.UIEModel(model_file,
|
||||
params_file,
|
||||
vocab_file,
|
||||
position_prob=0.5,
|
||||
max_length=128,
|
||||
schema=[],
|
||||
runtime_option=None,model_format=Frontend.PADDLE)
|
||||
```
|
||||
|
||||
UIEModel模型加载和初始化,其中`model_file`, `params_file`为训练模型导出的Paddle inference文件,具体请参考其文档说明[模型导出](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/uie/README.md#%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2),`vocab_file`为词表文件,UIE模型的词表可在[UIE配置文件](https://github.com/PaddlePaddle/PaddleNLP/blob/5401f01af85f1c73d8017c6b3476242fce1e6d52/model_zoo/uie/utils.py)中下载相应的UIE模型的vocab_file。
|
||||
|
||||
**参数**
|
||||
|
||||
> * **model_file**(str): 模型文件路径
|
||||
> * **params_file**(str): 参数文件路径
|
||||
> * **vocab_file**(str): 词表文件
|
||||
> * **position_prob**(float): 位置概率,模型将输出位置概率大于`position_prob`的位置,默认为0.5
|
||||
> * **max_length**(int): 输入文本的最大长度。输入文本下标超过`max_length`的部分将被截断。默认为128
|
||||
> * **schema**(list|dict): 抽取任务的目标信息。
|
||||
> * **runtime_option**(RuntimeOption): 后端推理配置,默认为None,即采用默认配置
|
||||
> * **model_format**(Frontend): 模型格式,默认为Paddle格式
|
||||
|
||||
### set_schema函数
|
||||
|
||||
> ```python
|
||||
> set_schema(schema)
|
||||
> ```
|
||||
> 设置UIE模型的schema接口。
|
||||
>
|
||||
> **参数**
|
||||
> > * **schema**(list|dict): 抽取任务的目标模式。
|
||||
>
|
||||
> **返回**
|
||||
> 空。
|
||||
|
||||
### predict函数
|
||||
|
||||
> ```python
|
||||
> UIEModel.predict(texts, return_dict=False)
|
||||
> ```
|
||||
>
|
||||
> 模型预测接口,输入文本列表直接输出抽取结果。
|
||||
>
|
||||
> **参数**
|
||||
>
|
||||
> > * **texts**(list(str)): 输入数据,待抽取文本列表。
|
||||
> > * **return_dict**(bool): 是否以字典形式输出UIE结果,默认为False。
|
||||
> **返回**
|
||||
>
|
||||
> > 返回`list(dict(str, list(fastdeploy.text.C.UIEResult)))`,列表中每个元素对应一条输入文本;当`return_dict=True`时,返回由Python字典组成的列表。详细可见[UIEResult说明](../../../../docs/api/text_results/uie_result.md)。
|
||||
|
||||
## 相关文档
|
||||
|
||||
[UIE模型详细介绍](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/uie/README.md)
|
||||
|
||||
[UIE模型导出方法](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/model_zoo/uie/README.md#%E6%A8%A1%E5%9E%8B%E9%83%A8%E7%BD%B2)
|
||||
|
||||
[UIE C++部署方法](../cpp/README.md)
|
157
examples/text/uie/python/infer.py
Normal file
157
examples/text/uie/python/infer.py
Normal file
@@ -0,0 +1,157 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import fastdeploy
|
||||
from fastdeploy.text import UIEModel
|
||||
import os
|
||||
from pprint import pprint
|
||||
|
||||
|
||||
def parse_arguments():
    """Parse the command-line options of the UIE inference example.

    Options:
        --model_dir        (required) directory of model, params and vocab file.
        --device           'cpu' (default) or 'gpu'.
        --backend          'onnx_runtime' (default), 'paddle_inference' or
                           'openvino'.
        --cpu_num_threads  integer, default 8.

    Returns:
        The ``argparse.Namespace`` produced by ``parser.parse_args()``.
    """
    # argparse is only needed here, so it is imported locally.
    # The previously imported (and never used) ``ast`` module was dropped.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_dir",
        required=True,
        help="The directory of model, params and vocab file.")
    parser.add_argument(
        "--device",
        type=str,
        default='cpu',
        choices=['cpu', 'gpu'],
        help="Type of inference device, support 'cpu' or 'gpu'.")
    parser.add_argument(
        "--backend",
        type=str,
        default='onnx_runtime',
        choices=['onnx_runtime', 'paddle_inference', 'openvino'],
        help="The inference runtime backend.")
    parser.add_argument(
        "--cpu_num_threads",
        type=int,
        default=8,
        help="The number of threads to execute inference in cpu device.")
    return parser.parse_args()
|
||||
|
||||
|
||||
def build_option(args):
    """Build a ``fastdeploy.RuntimeOption`` from the parsed CLI arguments.

    Reads ``args.device``, ``args.backend`` and ``args.cpu_num_threads`` and
    returns the configured option object.
    """
    option = fastdeploy.RuntimeOption()

    # Device: everything other than 'cpu' selects the GPU.
    device_method = 'use_cpu' if args.device == 'cpu' else 'use_gpu'
    getattr(option, device_method)()

    # Backend: a name that is not listed leaves the backend untouched.
    backend_methods = {
        'onnx_runtime': 'use_ort_backend',
        'paddle_inference': 'use_paddle_backend',
        'openvino': 'use_openvino_backend',
    }
    backend_method = backend_methods.get(args.backend)
    if backend_method is not None:
        getattr(option, backend_method)()

    # NOTE(review): applied for every backend/device combination, matching a
    # flat reading of the original; confirm against the upstream indentation.
    option.set_cpu_thread_num(args.cpu_num_threads)
    return option
|
||||
|
||||
|
||||
if __name__ == "__main__":
    args = parse_arguments()
    runtime_option = build_option(args)

    # Model, parameter and vocabulary files are all read from --model_dir.
    model_path, param_path, vocab_path = [
        os.path.join(args.model_dir, file_name)
        for file_name in ("inference.pdmodel", "inference.pdiparams",
                          "vocab.txt")
    ]

    # One entry per demo run: (title printed first, or None; schema; text).
    demos = [
        ("1. Named Entity Recognition Task",
         ["时间", "选手", "赛事名称"],
         "2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!"),
        (None,
         ["肿瘤的大小", "肿瘤的个数", "肝癌级别", "脉管内癌栓分级"],
         "(右肝肿瘤)肝细胞性肝癌(II-III级,梁索型和假腺管型),肿瘤包膜不完整,紧邻肝被膜,侵及周围肝组织,"
         "未见脉管内癌栓(MVI分级:M0级)及卫星子灶形成。(肿物1个,大小4.2×4.0×2.8cm)。"),
        ("2. Relation Extraction Task",
         {"竞赛名称": ["主办方", "承办方", "已举办次数"]},
         "2022语言与智能技术竞赛由中国中文信息学会和中国计算机学会联合主办,百度公司、中国中文信息学会评测工作"
         "委员会和中国计算机学会自然语言处理专委会承办,已连续举办4届,成为全球最热门的中文NLP赛事之一。"),
        ("3. Event Extraction Task",
         {"地震触发词": ["地震强度", "时间", "震中位置", "震源深度"]},
         "中国地震台网正式测定:5月16日06时08分在云南临沧市凤庆县(北纬24.34度,东经99.98度)发生3.5级地震,"
         "震源深度10千米。"),
        ("4. Opinion Extraction Task",
         {"评价维度": ["观点词", "情感倾向[正向,负向]"]},
         "店面干净,很清静,服务员服务热情,性价比很高,发现收银台有排队"),
        ("5. Sequence Classification Task",
         ["情感倾向[正向,负向]"],
         "这个产品用起来真的很流畅,我非常喜欢"),
        ("6. Cross Task Extraction Task",
         ["法院", {"原告": "委托代理人"}, {"被告": "委托代理人"}],
         "北京市海淀区人民法院\n民事判决书\n(199x)建初字第xxx号\n原告:张三。\n委托代理人李四,北京市 A律师"
         "事务所律师。\n被告:B公司,法定代表人王五,开发公司总经理。\n委托代理人赵六,北京市 C律师事务所律师。"),
    ]

    # The model is created with the schema of the first demo, so that demo
    # runs without a set_schema() call.
    uie = UIEModel(
        model_path,
        param_path,
        vocab_path,
        position_prob=0.5,
        max_length=128,
        schema=demos[0][1],
        runtime_option=runtime_option)

    last_index = len(demos) - 1
    for index, (title, schema, text) in enumerate(demos):
        if title is not None:
            print(title)
        print(f"The extraction schema: {schema}")
        if index > 0:
            uie.set_schema(schema)
        results = uie.predict([text], return_dict=True)
        pprint(results)
        # A blank line separates the runs; none is printed after the last.
        if index != last_index:
            print()
|
@@ -14,18 +14,47 @@
|
||||
|
||||
#include "fastdeploy/pybind/main.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
using namespace py::literals;
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
void BindUIE(pybind11::module& m);
|
||||
void BindUIE(py::module& m);
|
||||
|
||||
void BindText(pybind11::module& m) {
|
||||
pybind11::class_<text::UIEResult>(m, "UIEResult")
|
||||
.def(pybind11::init())
|
||||
// Converts a text::UIEResult into a Python dict with the keys "start", "end",
// "probability" and "text". When the result has related results, a
// "relation" dict is added that maps each relation name to a list of dicts
// built by calling this function recursively.
py::dict ConvertUIEResultToDict(const text::UIEResult& self) {
  py::dict converted;
  converted["start"] = self.start_;
  converted["end"] = self.end_;
  converted["probability"] = self.probability_;
  converted["text"] = self.text_;

  // No relations: the "relation" key is left out entirely.
  if (self.relation_.empty()) {
    return converted;
  }

  converted["relation"] = py::dict();
  for (const auto& relation : self.relation_) {
    py::list related;
    for (const auto& related_result : relation.second) {
      related.append(ConvertUIEResultToDict(related_result));
    }
    converted["relation"][relation.first.c_str()] = related;
  }
  return converted;
}
|
||||
|
||||
void BindText(py::module& m) {
|
||||
py::class_<text::UIEResult>(m, "UIEResult", py::dynamic_attr())
|
||||
.def(py::init())
|
||||
.def_readwrite("start", &text::UIEResult::start_)
|
||||
.def_readwrite("end", &text::UIEResult::end_)
|
||||
.def_readwrite("probability_", &text::UIEResult::probability_)
|
||||
.def_readwrite("probability", &text::UIEResult::probability_)
|
||||
.def_readwrite("text", &text::UIEResult::text_)
|
||||
.def_readwrite("relation", &text::UIEResult::relation_)
|
||||
.def("get_dict",
|
||||
[](const text::UIEResult& self) {
|
||||
return ConvertUIEResultToDict(self);
|
||||
})
|
||||
.def("__repr__", &text::UIEResult::Str)
|
||||
.def("__str__", &text::UIEResult::Str);
|
||||
BindUIE(m);
|
||||
|
@@ -22,7 +22,7 @@ void BindUIE(pybind11::module& m) {
|
||||
py::class_<text::SchemaNode>(m, "SchemaNode")
|
||||
.def(py::init<>())
|
||||
.def(py::init<std::string, std::vector<text::SchemaNode>>(),
|
||||
py::arg("name"), py::arg("children") = {})
|
||||
py::arg("name"), py::arg("children"))
|
||||
.def_readwrite("name", &text::SchemaNode::name_)
|
||||
.def_readwrite("prefix", &text::SchemaNode::prefix_)
|
||||
.def_readwrite("relations", &text::SchemaNode::relations_)
|
||||
|
@@ -14,3 +14,4 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from . import uie
|
||||
from .uie import UIEModel
|
||||
|
@@ -23,11 +23,13 @@ from ... import c_lib_wrap as C
|
||||
class SchemaNode(object):
|
||||
def __init__(self, name, children=[]):
|
||||
schema_node_children = []
|
||||
if isinstance(children, str):
|
||||
children = [children]
|
||||
for child in children:
|
||||
if isinstance(child, str):
|
||||
schema_node_children += [C.text.SchemaNode(child, [])]
|
||||
elif isinstance(child, dict):
|
||||
for key, val in child.item():
|
||||
for key, val in child.items():
|
||||
schema_node_child = SchemaNode(key, val)
|
||||
schema_node_children += [schema_node_child._schema_node]
|
||||
else:
|
||||
@@ -69,5 +71,15 @@ class UIEModel(object):
|
||||
schema = schema_tmp
|
||||
self._model.set_schema(schema)
|
||||
|
||||
def predict(self, texts):
|
||||
return self._model.predict(texts)
|
||||
def predict(self, texts, return_dict=False):
    """Run the wrapped model on ``texts``.

    Args:
        texts: passed unchanged to ``self._model.predict``.
        return_dict: when False (default) the model output is returned as
            is; when True every result object is converted with its
            ``get_dict()`` method.

    Returns:
        The raw model output, or a list with one dict per element of that
        output, mapping each key to the converted result.
    """
    raw_results = self._model.predict(texts)
    if return_dict:
        converted = []
        for per_text in raw_results:
            entry = dict()
            for schema_key, spans in per_text.items():
                # NOTE(review): each span overwrites the previous one, so
                # only the last result per key is kept; a key whose list is
                # empty is omitted. Confirm this is intended.
                for span in spans:
                    entry[schema_key] = span.get_dict()
            converted.append(entry)
        return converted
    return raw_results
|
||||
|
Reference in New Issue
Block a user