# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cd ../custom_ops/gpu_ops/fp8_deep_gemm rm -rf build dist deep_gemm.egg-info pip uninstall deep_gemm python setup.py develop cd ../../../scripts rm -rf log rm -f core* export NVIDIA_TF32_OVERRIDE=0 export NCCL_ALGO=Tree export FLAGS_allocator_strategy=auto_growth export FLAGS_fraction_of_gpu_memory_to_use=0.98 export FLAGS_gemm_use_half_precision_compute_type=False export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH export FLAGS_enable_pir_api=0 export FLAGS_use_append_attn=1 export FLAGS_use_fa3=1 export devices=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=${devices} # export FLAGS_enable_blaslt_global_search=1 # export FLAGS_cublaslt_device_best_config=/path/to/cublaslt_device_best_config.csv # export FLAGS_use_cutlass_device_best_config_path=/path/to/cutlass_device_best_config.json model_path=${1:-"/path/to/model"} for name in `env | grep -E 'PADDLE|ENDPOINT' | awk -F'=' '{print $1}'`; do unset ${name} done export PADDLE_TRAINER_ID=0 export PADDLE_TRAINERS_NUM=1 export TRAINER_INSTANCES_NUM=1 export TRAINER_INSTANCES=`hostname -i` self_ip=`hostname -i` python -m paddle.distributed.launch \ --gpus ${devices} \ fake_p_server.py \ --model_name_or_path ${model_path} \ --input_file "../data/qf_turbopro_5k_pd5_eb" \ --output_file ./predict_out.json \ --predict_model_type "WINT8" \ --dtype bfloat16 \ --data_format "pt" \ --append_bos_token "False" \ --max_dec_len 1 \ --top_p 0 \ --batch_size 8 \ --moe_quant_type "weight_only_int4" \ --use_ep "True" \ --generation_phase 1 \ --benchmark "False" \ --fake_server_p "True" \ --use_cache_kv_int8 "False" \ --scale_dir "None"