Files
FastDeploy/fastdeploy/download_model.py
2025-06-29 23:29:37 +00:00

228 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import requests
import os
from tqdm import tqdm
import argparse
import hashlib
import re
def parse_arguments(argv=None):
    """Parse command-line arguments for the model downloader.

    Args:
        argv (list[str] | None): Argument list to parse. Defaults to None,
            in which case argparse reads from ``sys.argv`` (backward
            compatible with the original zero-argument call).

    Returns:
        argparse.Namespace: Parsed arguments with the fields:
            - model_name (str): model to download
              (default ``'deepseek-ai/DeepSeek-R1/weight_only_int4'``).
            - dir (str): local save directory (default ``'downloads'``).
            - nnodes (int): number of nodes (default 1).
            - mode (str): ``'master'`` or ``'slave'``; only meaningful when
              running with more than one node (default ``'master'``).
            - speculate_model_path (str | None): optional save path for the
              speculative-decoding (MTP) model (default None).
    """
    parser = argparse.ArgumentParser(description="download models")
    parser.add_argument('-m', '--model_name', default='deepseek-ai/DeepSeek-R1/weight_only_int4',
                        help="model_name")
    parser.add_argument('-d', '--dir', default='downloads',
                        help="save dir")
    parser.add_argument('-n', '--nnodes', type=int, default=1,
                        help="the number of node")
    parser.add_argument('-M', '--mode', default="master", choices=["master", "slave"],
                        help="only support in 2 nodes model. There are two modes, master or slave.")
    parser.add_argument('-s', '--speculate_model_path', default=None,
                        help="speculate model path")
    return parser.parse_args(argv)
def calculate_md5(file_path, chunk_size=8192):
    """Compute the MD5 digest of a file.

    Args:
        file_path (str): Path of the file to hash.
        chunk_size (int, optional): Number of bytes to read per iteration.
            Defaults to 8192.

    Returns:
        str: Hexadecimal MD5 digest of the file contents.
    """
    digest = hashlib.md5()
    # Stream the file in fixed-size chunks so large files never have to
    # fit in memory at once.
    with open(file_path, 'rb') as stream:
        while chunk := stream.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
def download_file(url, save_path, md5sum):
    """Download ``url`` to ``save_path`` with a tqdm progress bar.

    If ``save_path`` already exists it is reused when the ``MD5_CHECK``
    environment variable is off, or when its MD5 matches ``md5sum``;
    otherwise the stale file is removed and re-downloaded.

    Args:
        url (str): Remote file URL.
        save_path (str): Local destination path.
        md5sum (str): Expected MD5 hex digest of the file.

    Returns:
        str | None: ``save_path`` on success, None on any failure.
    """
    md5_check = int(os.getenv("MD5_CHECK", "0")) == 1
    try:
        # Inspect the local copy BEFORE issuing the HTTP request; the
        # original opened the connection first, wasting a round-trip (and
        # server bandwidth) even when the download was going to be skipped.
        if os.path.exists(save_path):
            if not md5_check:
                print(f"{save_path} already exists and md5 check is off, skip this step")
                return save_path
            current_md5sum = calculate_md5(save_path)
            if md5sum != current_md5sum:
                os.remove(save_path)
                print("not complete file! start to download again")
            else:
                print(f"{save_path} already exists and md5sum matches")
                return save_path
        # timeout guards against an indefinite hang on connect/read.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            file_name = save_path.split('/')[-1]
            total_size = int(response.headers.get('content-length', 0))
            progress_bar = tqdm(
                total=total_size,
                unit='iB',
                unit_scale=True,
                desc=f"download {file_name}"
            )
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
            progress_bar.close()
            # A size mismatch means the transfer was truncated.
            if total_size != 0 and os.path.getsize(save_path) != total_size:
                raise RuntimeError("not complete")
            return save_path
    except Exception as e:
        # Report the failure instead of swallowing it silently (the
        # original discarded the exception); callers treat None as failure.
        print(f"download of {url} failed: {e}")
        # Remove any partial file so a retry starts clean.
        if save_path and os.path.exists(save_path):
            os.remove(save_path)
        return None
def download_from_txt(base_url, save_dir, model_name=None):
    """Download every file listed in ``<base_url>/file_list.txt``.

    Each non-empty line of ``file_list.txt`` has the form
    ``<relative_path>:<md5sum>``; every listed file is fetched from
    ``base_url`` and saved under ``save_dir``.

    Args:
        base_url (str): Base URL hosting ``file_list.txt`` and the files.
        save_dir (str): Local directory to save the downloads into.
        model_name (str, optional): Unused here; kept for interface
            compatibility. Defaults to None.

    Raises:
        Exception: If the file list itself cannot be fetched.
    """
    txt_url = base_url + "/file_list.txt"
    print(f"{txt_url}")
    try:
        response = requests.get(txt_url)
        response.raise_for_status()
        entries = [line.strip() for line in response.text.splitlines() if line.strip()]
        if not entries:
            print("No valid files found.")
            return
        # Split from the right so file names containing ':' stay intact.
        names = [entry.rsplit(':', 1)[0] for entry in entries]
        checksums = [entry.rsplit(':', 1)[-1] for entry in entries]
        print(f"Found {len(entries)} files")
        for name, checksum in zip(names, checksums):
            cur_url = base_url + f"/{name}"
            result = download_file(cur_url, os.path.join(save_dir, name), checksum)
            if result:
                print(f"[✓] Success: {result}")
            else:
                print(f"[×] Failed: {cur_url}")
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to download file list from {txt_url}: {str(e)}")
def main():
    """Entry point: validate the model name and download its files.

    Reads CLI arguments, checks the model against the supported family
    list (Qwen / Llama / Mixtral / DeepSeek), then downloads the
    (possibly multi-node) static model and, optionally, the
    speculative-decoding (MTP) model.

    Raises:
        ValueError: If the model name is unsupported, or if the
            nnodes/mode combination is invalid.
    """
    args = parse_arguments()
    print(f"Save Path: {os.path.abspath(args.dir)}")
    # make dir
    path = os.path.join(args.dir, args.model_name)
    os.makedirs(path, exist_ok=True)
    model_name = args.model_name
    env = os.environ
    # Define supported model patterns
    supported_patterns = [
        r".*Qwen.*",
        r".+Llama.+",
        r".+Mixtral.+",
        r".+DeepSeek.+",
    ]
    # Check if model_name matches any supported pattern
    if not any(re.match(pattern, model_name) for pattern in supported_patterns):
        # One single message string: the original passed three separate
        # strings to ValueError, which rendered as a confusing tuple.
        raise ValueError(
            f"{model_name} is not in the supported list. "
            "Currently supported models: Qwen, Llama, Mixtral, DeepSeek. "
            "Please check the model name from this document "
            "https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/server/docs/static_models.md"
        )
    print(f"Start downloading model: {model_name}")
    tag = env.get("tag")
    if not tag:
        # Without a tag the URL below would contain the literal "None".
        print("Warning: environment variable 'tag' is not set; "
              "the download URL will likely be invalid.")
    base_url = f"https://paddlenlp.bj.bcebos.com/models/static/{tag}/{model_name}"
    # Select the remote sub-directory based on the node topology.
    temp_file = None
    if args.nnodes == 1:
        temp_file = "model"
    elif args.nnodes > 1:
        if args.mode == "master":
            temp_file = "node1"
        elif args.mode == "slave":
            temp_file = "node2"
        else:
            raise ValueError(f"Invalid mode: {args.mode}. Mode must be 'master' or 'slave'.")
    else:
        raise ValueError(f"Invalid nnodes: {args.nnodes}. nnodes must be >= 1.")
    if temp_file:
        model_url = base_url + f"/{temp_file}"
        download_from_txt(model_url, path)
    else:
        print(f"Don't support download the {model_name} in mode {args.mode}")
    # Optionally fetch the speculative-decoding (MTP) model alongside.
    if args.speculate_model_path:
        os.makedirs(args.speculate_model_path, exist_ok=True)
        print(f"Start downloading mtp model: {model_name}")
        model_url = base_url + "/mtp"
        download_from_txt(model_url, args.speculate_model_path)


if __name__ == "__main__":
    main()