mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Intel HPU] add example benchmark scripts for hpu (#5304)
* [Intel HPU] add example benchmark scripts for hpu
* Revise the code based on the copilot comments
* update code based on comments
* update ci ops version
246 examples/intel_hpu/bench_gsm8k.py Normal file
@@ -0,0 +1,246 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Metric evaluation for Fastdeploy + ERNIE-4.5-Turbo"""
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
import argparse
import ast
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
from tqdm import tqdm

INVALID = -9999999


def call_generate(prompt, **kwargs):
    """
    Generates response based on the input prompt.

    Args:
        prompt (str): The input prompt text.
        **kwargs: Keyword arguments, including server IP address and port number.

    Returns:
        str: The response generated based on the prompt.

    """
    url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "temperature": 0.6,
        "max_tokens": 2047,
        "top_p": 0.95,
        "do_sample": True,
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))
    out = response.json()
    return out["choices"][0]["message"]["content"]


def get_one_example(lines, i, include_answer):
    """
    Retrieves a question-answer example from the given list of text lines.

    Args:
        lines (list of dict): A list of question-answer pairs.
        i (int): The index of the question-answer pair to retrieve from lines.
        include_answer (bool): Whether to include the answer in the returned string.

    Returns:
        str: A formatted question-answer string in the format "Question: <question>\nAnswer: <answer>".

    """
    ret = "Question: " + lines[i]["question"] + "\nAnswer:"
    if include_answer:
        ret += " " + lines[i]["answer"]
    return ret


def get_few_shot_examples(lines, k):
    """
    Selects k examples from the given list of text lines and concatenates them into a single string.

    Args:
        lines (list): A list containing text lines.
        k (int): The number of examples to select.

    Returns:
        str: A string composed of k examples, separated by two newline characters.
    """
    ret = ""
    for i in range(k):
        ret += get_one_example(lines, i, True) + "\n\n"
    return ret


def get_answer_value(answer_str):
    """
    Extracts numerical values from an answer string and returns them.

    Args:
        answer_str (str): The string containing the answer.

    Returns:
        The extracted numerical value; returns "INVALID" if extraction fails.
    """
    answer_str = answer_str.replace(",", "")
    numbers = re.findall(r"\d+", answer_str)
    if len(numbers) < 1:
        return INVALID
    try:
        return ast.literal_eval(numbers[-1])
    except SyntaxError:
        return INVALID


def read_jsonl(filename: str):
    """
    Reads a JSONL file.

    Args:
        filename (str): Path to the JSONL file.

    Yields:
        dict: A dictionary object corresponding to each line in the JSONL file.
    """
    with open(filename) as fin:
        for line in fin:
            if line.startswith("#"):
                continue
            yield json.loads(line)


def main(args):
    """
    Process inputs and generate answers by calling the model in parallel using a thread pool.

    Args:
        args (argparse.Namespace):
            - num_questions (int): Number of questions to process.
            - num_shots (int): Number of few-shot learning examples.
            - ip (str): IP address of the model service.
            - port (int): Port number of the model service.
            - parallel (int): Number of questions to process in parallel.
            - result_file (str): File path to store the results.

    Returns:
        None

    """
    # Read data
    filename = args.data_path

    lines = list(read_jsonl(filename))

    # Construct prompts
    num_questions = args.num_questions
    num_shots = args.num_shots
    few_shot_examples = get_few_shot_examples(lines, num_shots)

    questions = []
    labels = []
    for i in range(len(lines[:num_questions])):
        questions.append(get_one_example(lines, i, False))
        labels.append(get_answer_value(lines[i]["answer"]))
    assert all(l != INVALID for l in labels)

    states = [None] * len(labels)

    # Use thread pool
    def get_one_answer(i):
        answer = call_generate(
            prompt=few_shot_examples + questions[i],
            # stop=["Question", "Assistant:", "<|separator|>"],
            ip=args.ip,
            port=args.port,
        )
        states[i] = answer

    tic = time.time()
    if args.parallel == 1:
        for i in tqdm(range(len(questions))):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
            list(
                tqdm(
                    executor.map(get_one_answer, list(range(len(questions)))),
                    total=len(questions),
                )
            )

    latency = time.time() - tic
    preds = []

    with open(args.acc_log, "w") as fout:
        for i in range(len(states)):
            answer = get_answer_value(states[i])
            preds.append(answer)
            fout.write("\n################################################################\n")
            fout.write("-----------prompt--------------\n")
            fout.write(f"{few_shot_examples + questions[i]}\n")
            fout.write("-----------answer--------------\n")
            fout.write(f"answer= {states[i]}\n")
            fout.write("-----------accuracy--------------\n")
            fout.write(f"Correct={answer==labels[i]}, pred={answer}, label={labels[i]} \n")

    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
    invalid = np.mean(np.array(preds) == INVALID)

    # Print results
    print(f"Accuracy: {acc:.3f}")
    print(f"Invalid: {invalid:.3f}")
    print(f"Latency: {latency:.3f} s")

    with open(args.result_file, "a") as fout:
        value = {
            "task": "gsm8k",
            "backend": "paddlepaddle",
            "num_gpus": 1,
            "latency": round(latency, 3),
            "accuracy": round(acc, 3),
            "num_requests": args.num_questions,
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ip", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=str, default="8188")
    parser.add_argument("--num-shots", type=int, default=10)
    parser.add_argument("--data-path", type=str, default="test.jsonl")
    parser.add_argument("--num-questions", type=int, default=1319)
    parser.add_argument("--result-file", type=str, default="result.jsonl")
    parser.add_argument("--parallel", type=int, default=1)
    parser.add_argument("--acc-log", type=str, default="accuracy.log")
    args = parser.parse_args()
    main(args)
72 examples/intel_hpu/benchmark_paddle_hpu_cli.sh Executable file
@@ -0,0 +1,72 @@
#!/bin/bash

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# set -x

model="ERNIE-4.5-21B-A3B-Paddle"
model_log_name="ERNIE-4.5-21B-A3B-Paddle"
model_yaml="yaml/eb45-21b-a3b-32k-bf16.yaml"
# model="ERNIE-4.5-300B-A47B-Paddle"
# model_log_name="ERNIE-4.5-300B-A47B-Paddle"
# model_yaml="yaml/eb45-300b-a47b-32k-bf16.yaml"

export SERVER_PORT=8188
export no_proxy=localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24

input_lengths=(1024 2048)
output_lengths=(1024)
batch_sizes=(1 2 4 8 16 32 64 128)

workspace=$(pwd)
cd $workspace
log_home=$workspace/benchmark_fastdeploy_logs/$(TZ='Asia/Shanghai' date '+WW%V')_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)_${model_log_name}_FixedLen

mkdir -p ${log_home}

for input_length in "${input_lengths[@]}"
do
    for output_length in "${output_lengths[@]}"
    do
        for batch_size in "${batch_sizes[@]}"
        do
            > log/hpu_model_runner_profile.log
            num_prompts=$(( batch_size * 3 ))
            log_name_prefix="benchmarkdata_${model_log_name}_inputlength_${input_length}_outputlength_${output_length}_batchsize_${batch_size}_numprompts_${num_prompts}"
            log_name=${log_name_prefix}_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)
            echo "running benchmark with input length ${input_length}, output length ${output_length}, batch size ${batch_size}, log name ${log_name}"
            cmd="python ../../benchmarks/benchmark_serving.py \
                --backend openai-chat \
                --model $model \
                --endpoint /v1/chat/completions \
                --host 0.0.0.0 \
                --port ${SERVER_PORT} \
                --dataset-name random \
                --random-input-len ${input_length} \
                --random-output-len ${output_length} \
                --random-range-ratio 0 \
                --hyperparameter-path ../../benchmarks/${model_yaml} \
                --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
                --metric-percentiles 80,95,99,99.9,99.95,99.99 \
                --num-prompts ${num_prompts} \
                --max-concurrency ${batch_size} \
                --ignore-eos"
            echo $cmd | tee -a ${log_home}/${log_name}.log
            eval $cmd >> ${log_home}/${log_name}.log 2>&1

            cp log/hpu_model_runner_profile.log ${log_home}/${log_name}_profile.log
        done
    done
done
64 examples/intel_hpu/benchmark_paddle_hpu_cli_sharegpt.sh Executable file
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# set -x

model="ERNIE-4.5-21B-A3B-Paddle"
model_log_name="ERNIE-4.5-21B-A3B-Paddle"
model_yaml="yaml/eb45-21b-a3b-32k-bf16.yaml"
# model="ERNIE-4.5-300B-A47B-Paddle"
# model_log_name="ERNIE-4.5-300B-A47B-Paddle"
# model_yaml="yaml/eb45-300b-a47b-32k-bf16.yaml"
export SERVER_PORT=8188
export no_proxy=.intel.com,intel.com,localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24

CARD_NUM=$1

if [[ "$CARD_NUM" == "1" ]]; then
    batch_size=128
else
    batch_size=64
fi

num_prompts=2000

workspace=$(pwd)
cd $workspace
log_home=$workspace/benchmark_fastdeploy_logs/$(TZ='Asia/Shanghai' date '+WW%V')_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)_${model_log_name}

mkdir -p ${log_home}

log_name_prefix="benchmarkdata_${model_log_name}_sharegpt_prompts_${num_prompts}_concurrency_${batch_size}"
log_name=${log_name_prefix}_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)
echo "running benchmark with sharegpt log name ${log_name}"
cmd="python ../../benchmarks/benchmark_serving.py \
    --backend openai-chat \
    --model $model \
    --endpoint /v1/chat/completions \
    --host 0.0.0.0 \
    --port ${SERVER_PORT} \
    --dataset-name EBChat \
    --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json \
    --hyperparameter-path ../../benchmarks/${model_yaml} \
    --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len \
    --metric-percentiles 80,95,99,99.9,99.95,99.99 \
    --max-concurrency ${batch_size} \
    --num-prompts ${num_prompts} \
    --sharegpt-output-len 4096 \
    --save-result "
echo $cmd | tee -a ${log_home}/${log_name}.log
eval $cmd >> ${log_home}/${log_name}.log 2>&1
cp log/hpu_model_runner_profile.log ${log_home}/${log_name}_profile.log
49 examples/intel_hpu/benchmark_paddle_hpu_server.sh Executable file
@@ -0,0 +1,49 @@
#!/bin/bash
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
# export FLAGS_intel_hpu_recipe_cache_config=/tmp/recipe,false,10240
export FLAGS_intel_hpu_recipe_cache_num=20480
export SERVER_PORT=8188
export ENGINE_WORKER_QUEUE_PORT=8002
export METRICS_PORT=8001
export CACHE_QUEUE_PORT=8003
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0
rm -rf log 2>/dev/null
FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN \
python -m fastdeploy.entrypoints.openai.api_server \
    --model ERNIE-4.5-21B-A3B-Paddle \
    --port ${SERVER_PORT} \
    --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} \
    --metrics-port ${METRICS_PORT} \
    --cache-queue-port ${CACHE_QUEUE_PORT} \
    --tensor-parallel-size 1 \
    --max-model-len 32768 \
    --max-num-seqs 128 \
    --block-size 128 \
    --num-gpu-blocks-override 3100 \
    --kv-cache-ratio 0.991 \
    --no-enable-prefix-caching \
    --graph-optimization-config '{"use_cudagraph":false}'

# (2k + 1k) / 128(block_size) * 128(batch) = 3072
# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# rm -rf log 2>/dev/null
# FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \
# python -m fastdeploy.entrypoints.openai.api_server \
#     --model ERNIE-4.5-300B-A47B-Paddle \
#     --port ${SERVER_PORT} \
#     --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} \
#     --metrics-port ${METRICS_PORT} \
#     --cache-queue-port ${CACHE_QUEUE_PORT} \
#     --tensor-parallel-size 8 \
#     --max-model-len 32768 \
#     --max-num-seqs 128 \
#     --block-size 128 \
#     --num-gpu-blocks-override 3100 \
#     --kv-cache-ratio 0.991 \
#     --no-enable-prefix-caching \
#     --graph-optimization-config '{"use_cudagraph":false}'
35 examples/intel_hpu/benchmark_paddle_hpu_server_sharegpt.sh Executable file
@@ -0,0 +1,35 @@
#!/bin/bash
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
# export FLAGS_intel_hpu_recipe_cache_config=/tmp/recipe,false,10240
export FLAGS_intel_hpu_recipe_cache_num=20480
export SERVER_PORT=8188
export ENGINE_WORKER_QUEUE_PORT=8002
export METRICS_PORT=8001
export CACHE_QUEUE_PORT=8003
export HABANA_PROFILE=0

CARD_NUM=$1

if [[ "$CARD_NUM" == "1" ]]; then
    export HPU_VISIBLE_DEVICES=0
    export MODEL="ERNIE-4.5-21B-A3B-Paddle"
    export GPU_BLOCKS=5000
elif [[ "$CARD_NUM" == "8" ]]; then
    export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    export MODEL="ERNIE-4.5-300B-A47B-Paddle"
    export GPU_BLOCKS=3000
else
    exit 0
fi

rm -rf log 2>/dev/null
FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN ENABLE_V1_KVCACHE_SCHEDULER=0 \
python -m fastdeploy.entrypoints.openai.api_server --model ${MODEL} --port ${SERVER_PORT} \
    --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} --metrics-port ${METRICS_PORT} \
    --cache-queue-port ${CACHE_QUEUE_PORT} --tensor-parallel-size ${CARD_NUM} --max-model-len 16384 \
    --max-num-seqs 128 --block-size 128 --kv-cache-ratio 0.5 --num-gpu-blocks-override ${GPU_BLOCKS} \
    --graph-optimization-config '{"use_cudagraph":false}'
173 examples/intel_hpu/draw_benchmark_data.py Normal file
@@ -0,0 +1,173 @@
import csv
import os
import re
import sys
from datetime import datetime

import matplotlib.dates as mdates
import matplotlib.pyplot as plt

log_patterns = [
    re.compile(
        r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*_profile\.log$"
    ),
]


def draw_time_graph(log_dir, log_filename, max_num_seqs, mode):
    # Store extracted time and BT values
    timestamps_model = []
    times_model = []
    bt_values_model = []
    block_list_shapes_model = []
    block_indices_shapes_model = []
    timestamps_pp = []
    times_pp = []
    bt_values_pp = []

    # Use regex to extract Model execution time and BT information
    pattern_model = re.compile(
        r"(\d+-\d+-\d+ \d+:\d+:\d+,\d+) .* Model execution time\(ms\): ([\d\.]+), BT=(\d+), block_list_shape=\[(\d+)\], block_indices_shape=\[(\d+)\]"
    )
    pattern_pp = re.compile(
        r"(\d+-\d+-\d+ \d+:\d+:\d+,\d+) .* PostProcessing execution time\(ms\): ([\d\.]+), BT=(\d+)"
    )
    # Read log file
    with open(os.path.join(log_dir, log_filename), "r") as file:
        for line in file:
            match_model = pattern_model.search(line)
            if match_model:
                bt_value = int(match_model.group(3))
                timestamps_model.append(datetime.strptime(match_model.group(1), "%Y-%m-%d %H:%M:%S,%f"))
                if mode == "prefill" and bt_value <= max_num_seqs:
                    times_model.append(None)
                    bt_values_model.append(None)
                    continue
                if mode == "decode" and bt_value > max_num_seqs:
                    times_model.append(None)
                    bt_values_model.append(None)
                    continue
                times_model.append(float(match_model.group(2)))
                bt_values_model.append(bt_value)
                block_list_shapes_model.append(int(match_model.group(4)))
                block_indices_shapes_model.append(int(match_model.group(5)))
            else:
                match_pp = pattern_pp.search(line)
                if match_pp:
                    bt_value = int(match_pp.group(3))
                    timestamps_pp.append(datetime.strptime(match_pp.group(1), "%Y-%m-%d %H:%M:%S,%f"))
                    if mode == "prefill" and bt_value <= max_num_seqs:
                        times_pp.append(None)
                        bt_values_pp.append(None)
                        continue
                    if mode == "decode" and bt_value > max_num_seqs:
                        times_pp.append(None)
                        bt_values_pp.append(None)
                        continue
                    times_pp.append(float(match_pp.group(2)))
                    bt_values_pp.append(bt_value)

    # Plot graphs
    plt.figure(figsize=(15, 7))

    date_format = mdates.DateFormatter("%m-%d %H:%M:%S")
    # Plot time graph
    plt.subplot(2, 1, 1)
    ax1 = plt.gca()
    ax2 = ax1.twinx()
    ax1.plot(timestamps_model, times_model, label="Model Execution Time (ms)", color="blue")
    ax2.plot(timestamps_pp, times_pp, label="PostProcessing Time (ms)", color="red")
    ax1.set_ylabel("Model Execution Time (ms)")
    ax2.set_ylabel("PostProcessing Time (ms)")
    ax1.xaxis.set_major_formatter(date_format)
    # Merge legends
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2)

    # Plot BT value graph
    plt.subplot(2, 1, 2)
    plt.plot(timestamps_model, bt_values_model, label="BT [" + mode + "]", color="orange")
    plt.ylabel("BT Value")
    plt.xlabel(log_filename, fontsize=8)

    plt.gca().xaxis.set_major_formatter(date_format)
    plt.legend()

    plt.tight_layout()
    output_filename = log_filename[:-4] + "_analysis_" + mode + ".png"
    plt.savefig(os.path.join(log_dir, output_filename), dpi=300)
    plt.close()

    # Write to CSV file
    if mode == "all":
        csv_filename = log_filename[:-4] + "_analysis.csv"
        with open(os.path.join(log_dir, csv_filename), "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                [
                    "Timestamp",
                    "ModelTime(ms)",
                    "BT",
                    "block_list_shape",
                    "block_indices_shape",
                    "Timestamp",
                    "PostProcessing(ms)",
                    "BT",
                ]
            )
            for i in range(len(times_model)):
                writer.writerow(
                    [
                        timestamps_model[i],
                        times_model[i],
                        bt_values_model[i],
                        block_list_shapes_model[i],
                        block_indices_shapes_model[i],
                        timestamps_pp[i],
                        times_pp[i],
                        bt_values_pp[i],
                    ]
                )


def main():
    if len(sys.argv) > 1:
        log_dir = sys.argv[1]
    else:
        log_dir = "."
    try:
        from natsort import natsorted

        natsort_available = True
    except ImportError:
        natsort_available = False

    files = []
    for f in os.listdir(log_dir):
        for pat in log_patterns:
            if pat.match(f):
                files.append(f)
                break
    if natsort_available:
        files = natsorted(files)
    else:
        import re as _re

        def natural_key(s):
            return [int(text) if text.isdigit() else text.lower() for text in _re.split("([0-9]+)", s)]

        files.sort(key=natural_key)

    for file in files:
        for idx, pat in enumerate(log_patterns):
            m = pat.match(file)
            if m:
                draw_time_graph(log_dir, file, 128, "prefill")
                draw_time_graph(log_dir, file, 128, "decode")
                draw_time_graph(log_dir, file, 128, "all")


if __name__ == "__main__":
    print("Starting to draw logs...")
    main()
5 examples/intel_hpu/eb45-300b-a47b-32k-bf16.yaml Normal file
@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
kv_cache_ratio: 0.75
tensor_parallel_size: 8
max_num_batched_tokens: 32768
70 examples/intel_hpu/intel_hpu_serving_benchmark.md Normal file
@@ -0,0 +1,70 @@
# Intel HPU serving benchmark
These scripts are used to launch the FastDeploy Paddle large model inference service for performance and stress testing.

## Main HPU-Specific Parameters
- `HPU_WARMUP_BUCKET`: Whether to enable warmup (1 means enabled)
- `HPU_WARMUP_MODEL_LEN`: Model length used for warmup (including input and output)
- `MAX_PREFILL_NUM`: Maximum batch size in the prefill stage, default 3
- `BATCH_STEP_PREFILL`: Batch step in the prefill stage, default 1
- `SEQUENCE_STEP_PREFILL`: Sequence step in the prefill stage, default 128, the same as the block size
- `CONTEXT_BLOCK_STEP_PREFILL`: Step size for block hits when prefix caching is enabled, default 1
- `BATCH_STEP_DECODE`: Batch step in the decode stage, default 4
- `BLOCK_STEP_DECODE`: Block step in the decode stage, default 16
- `FLAGS_intel_hpu_recipe_cache_num`: Limit on the number of cached HPU recipes
- `FLAGS_intel_hpu_recipe_cache_config`: HPU recipe cache config; can be used for warmup optimization
- `GC_KERNEL_PATH`: The default path of the HPU TPC kernels library
- `HABANA_PROFILE`: Whether to enable the profiler (1 means enabled)
- `PROFILE_START`: Profiler start step.
- `PROFILE_END`: Profiler end step.

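As a minimal sketch, a few of these can be exported before launching the server; the values below are the ones used in `benchmark_paddle_hpu_server.sh` and are a starting point rather than tuned defaults:
```bash
# Example settings, taken from benchmark_paddle_hpu_server.sh in this directory.
export HPU_WARMUP_BUCKET=1                    # enable warmup
export HPU_WARMUP_MODEL_LEN=4096              # warmup length, input plus output
export FLAGS_intel_hpu_recipe_cache_num=20480 # cap on cached HPU recipes
export HABANA_PROFILE=0                       # set to 1 (with PROFILE_START/PROFILE_END) to profile
```
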
## Usage
### 1. Start server
Two setup scripts are provided to start the FastDeploy server: one for the random (fixed-length) dataset and one for ShareGPT.

Before running, please make sure to correctly set the model path and port number in the script.
```bash
./benchmark_paddle_hpu_server.sh
./benchmark_paddle_hpu_server_sharegpt.sh
```
You can use HPU_VISIBLE_DEVICES in the script to select the HPU card. A quick way to verify that the server is up is shown below.

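Once startup and warmup have finished, a minimal request against the OpenAI-compatible endpoint (the same endpoint the client and accuracy scripts use; 8188 is the default `SERVER_PORT` in these scripts) should return a completion:
```bash
# Minimal smoke test against the chat endpoint; adjust host/port if you changed them.
curl -s http://127.0.0.1:8188/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "who are you?"}], "max_tokens": 32}'
```
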
### 2. Run client
Correspondingly, there are separate client test scripts; `benchmark_paddle_hpu_cli.sh` supports both variable-length and fixed-length tests.

Before running, please make sure to correctly set the model path, port number, and input/output settings in the script (the sweep arrays are shown in the excerpt after this section).
```bash
./benchmark_paddle_hpu_cli.sh
./benchmark_paddle_hpu_cli_sharegpt.sh
```

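The fixed-length sweep is driven by three arrays near the top of `benchmark_paddle_hpu_cli.sh` (excerpted verbatim below); each combination produces one run and one log, with `num_prompts` derived as `batch_size * 3`:
```bash
# Sweep matrix from benchmark_paddle_hpu_cli.sh: one benchmark run per combination.
input_lengths=(1024 2048)
output_lengths=(1024)
batch_sizes=(1 2 4 8 16 32 64 128)
```
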
### 3. Parse logs
After batch testing, run the following script to automatically parse the logs and generate a CSV file.
```bash
python parse_benchmark_logs.py benchmark_fastdeploy_logs/[the targeted folder]
```
The performance data will be saved as a CSV file.

### 4. Analyse logs
During HPU_MODEL_RUNNER execution, performance logs are generated. The following script parses these logs and produces performance graphs to help identify bottlenecks.
```bash
python draw_benchmark_data.py benchmark_fastdeploy_logs/[the targeted folder]
```
The script will save the model execution times and batch tokens (BT) as a CSV file and plot them in a graph; the profile lines it parses look roughly like the sample below.

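For reference, the regexes in `draw_benchmark_data.py` expect profile lines of the following shape (illustrative values; the logger prefix between the timestamp and the measurement may differ):
```
2025-01-01 12:00:00,123 ... Model execution time(ms): 12.34, BT=256, block_list_shape=[96], block_indices_shape=[96]
2025-01-01 12:00:00,130 ... PostProcessing execution time(ms): 1.23, BT=256
```
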
### 5. Accuracy test
Accuracy testing uses GSM8K. Use the following conversion to generate the test file (a sample record is shown below).
```python
>>> import pandas as pd
>>> df = pd.read_parquet('tests/ce/accuracy_cases/gsm8k.parquet', engine='pyarrow')
>>> df.to_json('test.jsonl', orient='records', lines=True)
```
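Each line of the resulting `test.jsonl` is one JSON record with `question` and `answer` fields, which is what `bench_gsm8k.py` reads. The record below is illustrative and shortened; scoring only extracts the last number in the answer:
```
{"question": "Natalia sold clips to 48 of her friends ...", "answer": "... #### 72"}
```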
Run the following command to perform the accuracy test.
```bash
python -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 64
```

### 6. Offline demo
To quickly run an offline demo on HPU, set `model_name_or_path` in offline_demo.py, then run the start script directly.
```bash
./run_offline_demo.sh
```
53 examples/intel_hpu/offline_demo.py Normal file
@@ -0,0 +1,53 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM

model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
# model_name_or_path = "ERNIE-4.5-300B-A47B-Paddle"

# Hyperparameter settings
input_bs = 1
input_seq = None  # 1000
max_out_tokens = 128
server_max_bs = 128
TP = 1

# num_gpu_blocks_override = ceil((input_seq + max_out_tokens) / 128) * server_max_bs
# e.g. input_seq=1000, max_out_tokens=128: ceil(1128 / 128) * 128 = 9 * 128 = 1152
num_gpu_blocks_override = 2000
sampling_params = SamplingParams(max_tokens=max_out_tokens)
graph_optimization_config = {"use_cudagraph": False}
llm = LLM(
    model=model_name_or_path,
    tensor_parallel_size=TP,
    engine_worker_queue_port=8602,
    num_gpu_blocks_override=num_gpu_blocks_override,
    block_size=128,
    max_model_len=32768,
    max_num_seqs=server_max_bs,
    graph_optimization_config=graph_optimization_config,
)

if input_seq is None:
    prompt = "user: who are you?"
else:
    prompt = "hi " * input_seq
prompts = [prompt] * input_bs
for i in range(2):
    output = llm.generate(prompts=prompts, use_tqdm=True, sampling_params=sampling_params)

print(output)
195 examples/intel_hpu/parse_benchmark_logs.py Normal file
@@ -0,0 +1,195 @@
import csv
import os
import re
import sys

log_patterns = [
    re.compile(
        r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*(?<!_profile)\.log$"
    ),
    re.compile(r"benchmarkdata_(.+?)_sharegpt_prompts_(\d+)_concurrency_(\d+)_.*(?<!_profile)\.log$"),
]

metrics = [
    ("Mean Decode", r"Mean Decode:\s+([\d\.]+)"),
    ("Mean TTFT (ms)", r"Mean TTFT \(ms\):\s+([\d\.]+)"),
    ("Mean S_TTFT (ms)", r"Mean S_TTFT \(ms\):\s+([\d\.]+)"),
    ("Mean TPOT (ms)", r"Mean TPOT \(ms\):\s+([\d\.]+)"),
    ("Mean ITL (ms)", r"Mean ITL \(ms\):\s+([\d\.]+)"),
    ("Mean S_ITL (ms)", r"Mean S_ITL \(ms\):\s+([\d\.]+)"),
    ("Mean E2EL (ms)", r"Mean E2EL \(ms\):\s+([\d\.]+)"),
    ("Mean S_E2EL (ms)", r"Mean S_E2EL \(ms\):\s+([\d\.]+)"),
    ("Mean Input Length", r"Mean Input Length:\s+([\d\.]+)"),
    ("Mean Output Length", r"Mean Output Length:\s+([\d\.]+)"),
    ("Request throughput (req/s)", r"Request throughput \(req/s\):\s+([\d\.]+)"),
    ("Output token throughput (tok/s)", r"Output token throughput \(tok/s\):\s+([\d\.]+)"),
    ("Total Token throughput (tok/s)", r"Total Token throughput \(tok/s\):\s+([\d\.]+)"),
]


def parse_benchmark_log_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()
    result = {}
    for name, pattern in metrics:
        match = re.search(pattern, content)
        result[name] = match.group(1) if match else ""
    return result


def parse_profile_log_file(file_path):
    prepare_input_times = []
    model_times = []
    postprocessing_times = []
    steppaddle_times = []

    with open(file_path, "r") as file:
        for line in file:
            prepare_input_match = re.search(r"_prepare_inputs time\(ms\): (\d+\.\d+)", line)
            model_match = re.search(r"Model execution time\(ms\): (\d+\.\d+)", line)
            postprocessing_match = re.search(r"PostProcessing execution time\(ms\): (\d+\.\d+)", line)
            steppaddle_match = re.search(r"StepPaddle execution time\(ms\): (\d+\.\d+)", line)

            if prepare_input_match:
                prepare_input_times.append(float(prepare_input_match.group(1)))
            if model_match:
                model_times.append(float(model_match.group(1)))
            if postprocessing_match:
                postprocessing_times.append(float(postprocessing_match.group(1)))
            if steppaddle_match:
                steppaddle_times.append(float(steppaddle_match.group(1)))

    return prepare_input_times, model_times, postprocessing_times, steppaddle_times

def calculate_times(times, separate_first):
    if not times:
        return None, None
    if len(times) < 2:
        return times[0], None
    if separate_first:
        first_time = times[0]
        average_time = sum(times[1:]) / len(times[1:])
        return first_time, average_time
    else:
        return None, sum(times) / len(times)

def main():
    if len(sys.argv) > 1:
        log_dir = sys.argv[1]
    else:
        log_dir = "."
    try:
        from natsort import natsorted

        natsort_available = True
    except ImportError:
        natsort_available = False
    all_files = set(os.listdir(log_dir))
    files = []
    for f in os.listdir(log_dir):
        for pat in log_patterns:
            if pat.match(f):
                files.append(f)
                break
    if natsort_available:
        files = natsorted(files)
    else:
        import re as _re

        def natural_key(s):
            return [int(text) if text.isdigit() else text.lower() for text in _re.split("([0-9]+)", s)]

        files.sort(key=natural_key)
    rows = []

    for file in files:
        m = None
        matched_idx = -1
        for idx, pat in enumerate(log_patterns):
            m = pat.match(file)
            if m:
                matched_idx = idx
                break
        if not m:
            continue
        # model_name, input_len, output_len, batch_size, num_prompts
        # model_name, num_prompts, max_concurrency
        if matched_idx == 0:
            model_name, input_len, output_len, batch_size, num_prompts = m.groups()
        elif matched_idx == 1:
            model_name, num_prompts, max_concurrency = m.groups()
            input_len = "-"
            output_len = "-"
        if file.endswith(".log"):
            profile_file = file[:-4] + "_profile.log"
        else:
            profile_file = ""
        prepare_input_average = model_first = model_average = postprocessing_average = steppaddle_average = ""
        if profile_file in all_files:
            prepare_input_times, model_times, postprocessing_times, steppaddle_times = parse_profile_log_file(
                os.path.join(log_dir, profile_file)
            )
            _, pia = calculate_times(prepare_input_times, False)
            mf, ma = calculate_times(model_times, True)
            _, pa = calculate_times(postprocessing_times, False)
            _, sa = calculate_times(steppaddle_times, False)
            prepare_input_average = pia if pia is not None else ""
            model_first = mf if mf is not None else ""
            model_average = ma if ma is not None else ""
            postprocessing_average = pa if pa is not None else ""
            steppaddle_average = sa if sa is not None else ""
        data = parse_benchmark_log_file(os.path.join(log_dir, file))
        data["dataset"] = "Fixed-Length" if matched_idx == 0 else "ShareGPT"
        data["model_name"] = model_name
        data["input_length"] = input_len
        data["output_length"] = output_len
        data["batch_size"] = batch_size if matched_idx == 0 else max_concurrency
        data["num_prompts"] = num_prompts
        data["prepare_input_average"] = prepare_input_average
        data["model_execute_first"] = model_first
        data["model_execute_average"] = model_average
        data["postprocessing_execute_average"] = postprocessing_average
        data["steppaddle_execute_average"] = steppaddle_average
        rows.append(data)

    import datetime

    import pytz

    shanghai_tz = pytz.timezone("Asia/Shanghai")
    now = datetime.datetime.now(shanghai_tz)
    ts = now.strftime("%Y%m%d_%H%M%S")
    log_dir_name = os.path.basename(os.path.abspath(log_dir))
    if log_dir_name == "" or log_dir == "." or log_dir == "/":
        csv_filename = f"benchmark_summary_{ts}.csv"
    else:
        csv_filename = f"benchmark_summary_{log_dir_name}_{ts}.csv"
    fieldnames = (
        [
            "model_name",
            "dataset",
            "input_length",
            "output_length",
            "batch_size",
            "num_prompts",
        ]
        + [name for name, _ in metrics]
        + [
            "prepare_input_average",
            "model_execute_first",
            "model_execute_average",
            "postprocessing_execute_average",
            "steppaddle_execute_average",
        ]
    )
    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
    print(f"CSV saved as: {csv_filename}")


if __name__ == "__main__":
    print("Starting to parse logs...")
    main()
19 examples/intel_hpu/run_offline_demo.sh Executable file
@@ -0,0 +1,19 @@
#!/bin/bash

export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HPU_VISIBLE_DEVICES=0
export HABANA_PROFILE=0
export PROFILE_START=1
export PROFILE_END=3
# export HABANA_LOGS=hpu_logs
# export LOG_LEVEL_ALL=0
# export FLAGS_intel_hpu_runtime_debug=1
# export FLAGS_intel_hpu_reciperunner_debug=1

rm -rf log
FD_ATTENTION_BACKEND=HPU_ATTN python offline_demo.py
@@ -26,8 +26,8 @@ python -m pip uninstall fastdeploy_intel_hpu -y
 #to install paddlepaddle
 pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
 #to install paddlecustomdevice? (paddle-intel-hpu)
-pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.1-cp310-cp310-linux_x86_64.whl
-pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.0-cp310-cp310-linux_x86_64.whl
+pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.2-cp310-cp310-linux_x86_64.whl
+pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.2-cp310-cp310-linux_x86_64.whl

 #to build and install fastdeploy
 echo "build whl"