Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[Metax] add ci yaml (#5520)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Co-authored-by: Jiaxin Sui <95567040+plusNew001@users.noreply.github.com>
tests/ci_use/Metax_UT/run_ernie_vl_28B.py (new file, 36 lines added)
@@ -0,0 +1,36 @@
import os

os.environ["MACA_VISIBLE_DEVICES"] = "0,1"
os.environ["FD_MOE_BACKEND"] = "cutlass"
os.environ["PADDLE_XCCL_BACKEND"] = "metax_gpu"
os.environ["FLAGS_weight_only_linear_arch"] = "80"
os.environ["FD_METAX_KVCACHE_MEM"] = "8"
os.environ["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
os.environ["FD_ENC_DEC_BLOCK_NUM"] = "2"


import fastdeploy

sampling_params = fastdeploy.SamplingParams(top_p=0.95, max_tokens=2048, temperature=0.6)

llm = fastdeploy.LLM(
    model="/data/models/PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Thinking",
    tensor_parallel_size=2,
    engine_worker_queue_port=8899,
    max_model_len=2048,
    quantization="wint8",
    load_choices="default_v1",
    disable_custom_all_reduce=True,
)

prompts = [
    "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?",
]

outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    print(f"Prompt: {prompt!r}")
    print(f"Generated: {generated_text!r}")
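The script above only prints the generations, so a CI job has no pass/fail signal beyond the exit code. A minimal sketch of how such a job could wrap it is shown below; this wrapper is a hypothetical addition (not part of this commit) and assumes the script is invoked from the repository root on a machine with the Metax GPUs and the model path available.

# Hypothetical CI wrapper (not part of this commit): runs the Metax test
# script and fails if it exits non-zero or prints no generated text.
import subprocess
import sys

SCRIPT = "tests/ci_use/Metax_UT/run_ernie_vl_28B.py"  # path added by this commit

proc = subprocess.run(
    [sys.executable, SCRIPT],
    capture_output=True,
    text=True,
)

print(proc.stdout)
print(proc.stderr, file=sys.stderr)

# Fail fast if the script crashed or produced no "Generated:" line.
assert proc.returncode == 0, f"{SCRIPT} exited with {proc.returncode}"
assert "Generated:" in proc.stdout, "no generated text found in output"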