Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-12 12:00:30 +08:00)
add_cli_tokenizer (#4278)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
fastdeploy/entrypoints/cli/tokenizer.py (new file, 249 lines added)
@@ -0,0 +1,249 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import annotations

import argparse
import json
import typing
from pathlib import Path

from fastdeploy.entrypoints.cli.types import CLISubcommand
from fastdeploy.input.preprocess import InputPreprocessor

if typing.TYPE_CHECKING:
    from fastdeploy.utils import FlexibleArgumentParser


class TokenizerSubcommand(CLISubcommand):
    """The `tokenizer` subcommand for the FastDeploy CLI."""

    name = "tokenizer"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        main(args)

    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        tokenizer_parser = subparsers.add_parser(
            name=self.name,
            help="FastDeploy tokenizer utility: encode/decode text, inspect and export the vocabulary.",
            description="FastDeploy tokenizer utility: encode/decode text, inspect and export the vocabulary.",
            usage="fastdeploy tokenizer [--encode/-e TEXT] [--decode/-d TEXT]",
        )

        # Common arguments
        tokenizer_parser.add_argument(
            "--model_name_or_path",
            "--model",
            "-m",
            type=str,
            default="baidu/ERNIE-4.5-0.3B-PT",
            help="Path to model or model identifier",
        )
        tokenizer_parser.add_argument("--enable-mm", "-mm", action="store_true", help="Enable multi-modal support")
        tokenizer_parser.add_argument("--vocab-size", "-vs", action="store_true", help="Show vocabulary size")
        tokenizer_parser.add_argument("--info", "-i", action="store_true", help="Show tokenizer information")
        tokenizer_parser.add_argument(
            "--vocab-export", "-ve", type=str, metavar="FILE", help="Export vocabulary to file"
        )
        tokenizer_parser.add_argument("--encode", "-e", default=None, help="Encode text to tokens")
        tokenizer_parser.add_argument("--decode", "-d", default=None, help="Decode tokens to text")

        return tokenizer_parser


def cmd_init() -> list[CLISubcommand]:
    return [TokenizerSubcommand()]


def get_vocab_size(tokenizer) -> int:
    """Return the vocabulary size of the tokenizer."""
    try:
        if hasattr(tokenizer, "vocab_size"):
            return tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab_size"):
            return tokenizer.get_vocab_size()
        else:
            return 100295  # Fixed vocabulary size of Ernie4_5Tokenizer
    except Exception:
        return 0


def get_tokenizer_info(tokenizer) -> dict:
    """Collect metadata about the tokenizer."""
    info = {}

    try:
        # Basic attributes
        info["vocab_size"] = get_vocab_size(tokenizer)

        # Model type and path
        if hasattr(tokenizer, "name_or_path"):
            info["model_name"] = tokenizer.name_or_path

        # Tokenizer class
        info["tokenizer_type"] = type(tokenizer).__name__

        # Special tokens
        special_tokens = {}
        for attr in ["bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token"]:
            if hasattr(tokenizer, attr):
                token = getattr(tokenizer, attr)
                if token:
                    special_tokens[attr] = token
        info["special_tokens"] = special_tokens

        # Special token IDs
        special_token_ids = {}
        for attr in [
            "bos_token_id",
            "eos_token_id",
            "unk_token_id",
            "sep_token_id",
            "pad_token_id",
            "cls_token_id",
            "mask_token_id",
        ]:
            if hasattr(tokenizer, attr):
                token_id = getattr(tokenizer, attr)
                if token_id is not None:
                    special_token_ids[attr] = token_id
        info["special_token_ids"] = special_token_ids

        # Maximum model length
        if hasattr(tokenizer, "model_max_length"):
            info["model_max_length"] = tokenizer.model_max_length

    except Exception as e:
        info["error"] = f"Failed to get tokenizer info: {e}"

    return info


def get_vocab_dict(tokenizer) -> dict:
    """Return the vocabulary as a token-to-id mapping."""
    try:
        if hasattr(tokenizer, "vocab"):
            return tokenizer.vocab
        elif hasattr(tokenizer, "get_vocab"):
            return tokenizer.get_vocab()
        elif hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "vocab"):
            return tokenizer.tokenizer.vocab
        elif hasattr(tokenizer, "encoder"):
            return tokenizer.encoder
        else:
            return {}
    except Exception:
        return {}


def export_vocabulary(tokenizer, file_path: str) -> None:
    """Export the vocabulary to a file."""
    try:
        vocab = get_vocab_dict(tokenizer)
        if not vocab:
            print("Warning: Could not retrieve vocabulary from tokenizer")
            return

        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Choose the output format based on the file extension
        if path.suffix.lower() == ".json":
            with open(path, "w", encoding="utf-8") as f:
                json.dump(vocab, f, ensure_ascii=False, indent=2)
        else:
            # Default format: one token per line
            with open(path, "w", encoding="utf-8") as f:
                for token, token_id in sorted(vocab.items(), key=lambda x: x[1]):
                    # Handle non-printable characters
                    try:
                        f.write(f"{token_id}\t{repr(token)}\n")
                    except Exception:
                        f.write(f"{token_id}\t<unprintable>\n")

        print(f"Vocabulary exported to: {file_path}")
        print(f"Total tokens: {len(vocab)}")

    except Exception as e:
        print(f"Error exporting vocabulary: {e}")


def main(args: argparse.Namespace) -> None:

    def print_separator(title=""):
        if title:
            print(f"\n{'='*50}")
            print(f" {title}")
            print(f"{'='*50}")
        else:
            print(f"\n{'='*50}")

    # Validate arguments
    if not any([args.encode, args.decode, args.vocab_size, args.info, args.vocab_export]):
        print("Please specify at least one of: --encode, --decode, --vocab-size, --info, --vocab-export")
        return

    # Initialize the tokenizer
    preprocessor = InputPreprocessor(model_name_or_path=args.model_name_or_path, enable_mm=args.enable_mm)
    tokenizer = preprocessor.create_processor().tokenizer

    # Run the requested operations
    operations_count = 0

    if args.encode:
        print_separator("ENCODING")
        print(f"Input text: {args.encode}")
        encoded_text = tokenizer.encode(args.encode)
        print(f"Encoded tokens: {encoded_text}")
        operations_count += 1

    if args.decode:
        print_separator("DECODING")
        print(f"Input tokens: {args.decode}")
        try:
            if isinstance(args.decode, str):
                if args.decode.startswith("[") and args.decode.endswith("]"):
                    # Parse a JSON-style list such as "[1, 2, 3]"
                    tokens = json.loads(args.decode)
                else:
                    tokens = [int(x.strip()) for x in args.decode.split(",")]
            else:
                tokens = args.decode

            decoded_text = tokenizer.decode(tokens)
            print(f"Decoded text: {decoded_text}")
        except Exception as e:
            print(f"Error decoding tokens: {e}")
        operations_count += 1

    if args.vocab_size:
        print_separator("VOCABULARY SIZE")
        print(f"Vocabulary size: {get_vocab_size(tokenizer)}")
        operations_count += 1

    if args.info:
        print_separator("TOKENIZER INFO")
        print(json.dumps(get_tokenizer_info(tokenizer), indent=2))
        operations_count += 1

    if args.vocab_export:
        print_separator("EXPORT VOCABULARY")
        export_vocabulary(tokenizer, args.vocab_export)
        operations_count += 1

    print_separator()
    print(f"Completed {operations_count} operation(s)")
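
The module registers itself through `cmd_init()` and advertises the usage string `fastdeploy tokenizer [--encode/-e TEXT] [--decode/-d TEXT]`. The helpers can also be driven directly from Python; the sketch below is not part of the diff above, it only mirrors the calls `main()` makes, using the file's own default model identifier (return types depend on the underlying tokenizer implementation):

# Minimal sketch (not part of this commit): exercise the helpers without argparse.
# Assumes fastdeploy is installed and "baidu/ERNIE-4.5-0.3B-PT" can be resolved.
import json

from fastdeploy.entrypoints.cli.tokenizer import export_vocabulary, get_tokenizer_info
from fastdeploy.input.preprocess import InputPreprocessor

preprocessor = InputPreprocessor(model_name_or_path="baidu/ERNIE-4.5-0.3B-PT", enable_mm=False)
tokenizer = preprocessor.create_processor().tokenizer

# Same operations main() performs for --encode, --info, and --vocab-export
print(f"Encoded tokens: {tokenizer.encode('Hello, FastDeploy')}")
print(json.dumps(get_tokenizer_info(tokenizer), indent=2))
export_vocabulary(tokenizer, "vocab.json")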