Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-11 03:20:20 +08:00)

"""
|
||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import typing
|
||
from pathlib import Path
|
||
|
||
from fastdeploy.entrypoints.cli.types import CLISubcommand
|
||
from fastdeploy.input.preprocess import InputPreprocessor
|
||
|
||
if typing.TYPE_CHECKING:
|
||
from fastdeploy.utils import FlexibleArgumentParser
|
||
|
||
|
||
class TokenizerSubcommand(CLISubcommand):
|
||
"""The `tokenizer` subcommand for the FastDeploy CLI."""
|
||
|
||
name = "tokenizer"
|
||
|
||
@staticmethod
|
||
def cmd(args: argparse.Namespace) -> None:
|
||
main(args)
|
||
|
||
def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
|
||
tokenizer_parser = subparsers.add_parser(
|
||
name=self.name,
|
||
help="Start the FastDeploy Tokenizer Server.",
|
||
description="Start the FastDeploy Tokenizer Server.",
|
||
usage="fastdeploy tokenizer [--encode/-e TEXT] [--decode/-d TEXT]",
|
||
)
|
||
|
||
# 添加通用参数
|
||
tokenizer_parser.add_argument(
|
||
"--model_name_or_path",
|
||
"--model",
|
||
"-m",
|
||
type=str,
|
||
default="baidu/ERNIE-4.5-0.3B-PT",
|
||
help="Path to model or model identifier",
|
||
)
|
||
tokenizer_parser.add_argument("--enable-mm", "-mm", action="store_true", help="Enable multi-modal support")
|
||
tokenizer_parser.add_argument("--vocab-size", "-vs", action="store_true", help="Show vocabulary size")
|
||
tokenizer_parser.add_argument("--info", "-i", action="store_true", help="Show tokenizer information")
|
||
tokenizer_parser.add_argument(
|
||
"--vocab-export", "-ve", type=str, metavar="FILE", help="Export vocabulary to file"
|
||
)
|
||
tokenizer_parser.add_argument("--encode", "-e", default=None, help="Encode text to tokens")
|
||
tokenizer_parser.add_argument("--decode", "-d", default=None, help="Decode tokens to text")
|
||
|
||
return tokenizer_parser
|
||
|
||
|
||
def cmd_init() -> list[CLISubcommand]:
|
||
return [TokenizerSubcommand()]
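# Illustrative invocations (assuming this subcommand is registered with the
# `fastdeploy` CLI entry point via cmd_init() above; the model id is simply the
# parser default shown above):
#   fastdeploy tokenizer --encode "Hello, world!"
#   fastdeploy tokenizer --decode "[1, 2, 3]"
#   fastdeploy tokenizer -m baidu/ERNIE-4.5-0.3B-PT --info --vocab-size
#   fastdeploy tokenizer --vocab-export vocab.json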


def get_vocab_size(tokenizer) -> int:
    """Get the vocabulary size."""
    try:
        if hasattr(tokenizer, "vocab_size"):
            return tokenizer.vocab_size
        elif hasattr(tokenizer, "get_vocab_size"):
            return tokenizer.get_vocab_size()
        else:
            return 100295  # Fixed vocabulary size of Ernie4_5Tokenizer
    except Exception:
        return 0


def get_tokenizer_info(tokenizer) -> dict:
    """Get the tokenizer's metadata."""
    info = {}

    try:
        # Basic attributes
        info["vocab_size"] = get_vocab_size(tokenizer)

        # Model type and path
        if hasattr(tokenizer, "name_or_path"):
            info["model_name"] = tokenizer.name_or_path

        # Tokenizer class
        info["tokenizer_type"] = type(tokenizer).__name__

        # Special tokens
        special_tokens = {}
        for attr in ["bos_token", "eos_token", "unk_token", "sep_token", "pad_token", "cls_token", "mask_token"]:
            if hasattr(tokenizer, attr):
                token = getattr(tokenizer, attr)
                if token:
                    special_tokens[attr] = token
        info["special_tokens"] = special_tokens

        # Special token IDs
        special_token_ids = {}
        for attr in [
            "bos_token_id",
            "eos_token_id",
            "unk_token_id",
            "sep_token_id",
            "pad_token_id",
            "cls_token_id",
            "mask_token_id",
        ]:
            if hasattr(tokenizer, attr):
                token_id = getattr(tokenizer, attr)
                if token_id is not None:
                    special_token_ids[attr] = token_id
        info["special_token_ids"] = special_token_ids

        # Maximum model length
        if hasattr(tokenizer, "model_max_length"):
            info["model_max_length"] = tokenizer.model_max_length

    except Exception as e:
        info["error"] = f"Failed to get tokenizer info: {e}"

    return info
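# An illustrative shape of the dict returned above for the default ERNIE model
# (only vocab_size, model_name, and tokenizer_type are taken from this file;
# the remaining values are placeholders, not real tokenizer output):
#   {
#     "vocab_size": 100295,
#     "model_name": "baidu/ERNIE-4.5-0.3B-PT",
#     "tokenizer_type": "Ernie4_5Tokenizer",
#     "special_tokens": {"bos_token": "...", ...},
#     "special_token_ids": {"bos_token_id": ..., ...},
#     "model_max_length": ...
#   }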


def get_vocab_dict(tokenizer) -> dict:
    """Get the vocabulary dictionary (token-to-id mapping)."""
    try:
        if hasattr(tokenizer, "vocab"):
            return tokenizer.vocab
        elif hasattr(tokenizer, "get_vocab"):
            return tokenizer.get_vocab()
        elif hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "vocab"):
            return tokenizer.tokenizer.vocab
        elif hasattr(tokenizer, "encoder"):
            return tokenizer.encoder
        else:
            return {}
    except Exception:
        return {}


def export_vocabulary(tokenizer, file_path: str) -> None:
    """Export the vocabulary to a file."""
    try:
        vocab = get_vocab_dict(tokenizer)
        if not vocab:
            print("Warning: Could not retrieve vocabulary from tokenizer")
            return

        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Choose the output format based on the file extension
        if path.suffix.lower() == ".json":
            with open(path, "w", encoding="utf-8") as f:
                json.dump(vocab, f, ensure_ascii=False, indent=2)
        else:
            # Default format: one token per line
            with open(path, "w", encoding="utf-8") as f:
                for token, token_id in sorted(vocab.items(), key=lambda x: x[1]):
                    # Handle unprintable characters
                    try:
                        f.write(f"{token_id}\t{repr(token)}\n")
                    except Exception:
                        f.write(f"{token_id}\t<unprintable>\n")

        print(f"Vocabulary exported to: {file_path}")
        print(f"Total tokens: {len(vocab)}")

    except Exception as e:
        print(f"Error exporting vocabulary: {e}")


def main(args: argparse.Namespace) -> None:
    """Entry point for the tokenizer subcommand."""

    def print_separator(title=""):
        if title:
            print(f"\n{'='*50}")
            print(f" {title}")
            print(f"{'='*50}")
        else:
            print(f"\n{'='*50}")

    # Check arguments
    if not any([args.encode, args.decode, args.vocab_size, args.info, args.vocab_export]):
        print("Please specify at least one of: --encode, --decode, --vocab-size, --info, --vocab-export")
        return

    # Initialize the tokenizer
    preprocessor = InputPreprocessor(model_name_or_path=args.model_name_or_path, enable_mm=args.enable_mm)
    tokenizer = preprocessor.create_processor().tokenizer

    # Run the requested operations
    operations_count = 0

    if args.encode:
        print_separator("ENCODING")
        print(f"Input text: {args.encode}")
        encoded_text = tokenizer.encode(args.encode)
        print(f"Encoded tokens: {encoded_text}")
        operations_count += 1

    if args.decode:
        print_separator("DECODING")
        print(f"Input tokens: {args.decode}")
        try:
            if isinstance(args.decode, str):
                if args.decode.startswith("[") and args.decode.endswith("]"):
                    # Parse a JSON-style list such as "[1, 2, 3]"
                    tokens = json.loads(args.decode)
                else:
                    # Parse a comma-separated list such as "1, 2, 3"
                    tokens = [int(x.strip()) for x in args.decode.split(",")]
            else:
                tokens = args.decode

            decoded_text = tokenizer.decode(tokens)
            print(f"Decoded text: {decoded_text}")
        except Exception as e:
            print(f"Error decoding tokens: {e}")
        operations_count += 1

    if args.vocab_size:
        print_separator("VOCABULARY SIZE")
        print(f"Vocabulary size: {get_vocab_size(tokenizer)}")
        operations_count += 1

    if args.info:
        print_separator("TOKENIZER INFO")
        print(json.dumps(get_tokenizer_info(tokenizer), indent=2))
        operations_count += 1

    if args.vocab_export:
        print_separator("EXPORT VOCABULARY")
        export_vocabulary(tokenizer, args.vocab_export)
        operations_count += 1

    print_separator()
    print(f"Completed {operations_count} operation(s)")