[Feature] support async download features (#5003)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* support async download features

* add test case

* update code
This commit is contained in:
kevin
2025-11-19 22:23:36 +08:00
committed by GitHub
parent bde97e09f7
commit 109d48e456
10 changed files with 433 additions and 75 deletions

View File

@@ -21,6 +21,7 @@ import importlib
import json
import logging
import os
import pickle
import random
import re
import socket
@@ -975,6 +976,36 @@ def init_bos_client():
return BosClient(cfg)
def download_from_bos(bos_client, bos_links):
    """
    Download pickled objects from Baidu Object Storage (BOS).

    Args:
        bos_client: BOS client instance exposing ``get_object_as_string(bucket, key)``.
        bos_links: Single link or list of BOS links in the format
            "bos://bucket-name/path/to/object".

    Yields:
        tuple: (success: bool, data | error_msg: str)
            - On success: (True, deserialized_data)
            - On failure: (False, error_message), and processing of any
              remaining links stops after the first failure.

    Security Note:
        Uses pickle deserialization. Only use with trusted data sources.
    """
    if not isinstance(bos_links, list):
        bos_links = [bos_links]
    for link in bos_links:
        try:
            # Strip the scheme; removeprefix is a no-op for links without it.
            path = link.removeprefix("bos://")
            # Per the documented link format, the first path segment is the
            # bucket and everything after it is the object key. The previous
            # parsing ('/'.join(parts[1:-1]) as the bucket, last segment as
            # the key) dropped the bucket name and truncated multi-segment
            # object keys.
            bucket_name, _, object_key = path.partition("/")
            response = bos_client.get_object_as_string(bucket_name, object_key)
            # SECURITY: pickle.loads executes arbitrary code if the stored
            # object is attacker-controlled — trusted buckets only.
            yield True, pickle.loads(response)
        except Exception as e:
            # Report the failure to the caller and stop: later links are not
            # attempted once one download/deserialization fails.
            yield False, f"link {link} download error: {str(e)}"
            break
# Module-level loggers shared across the package, created through the
# project's get_logger helper; the second argument is presumably the log
# file name each logger writes to — confirm against get_logger's signature.
llm_logger = get_logger("fastdeploy", "fastdeploy.log")
data_processor_logger = get_logger("data_processor", "data_processor.log")
scheduler_logger = get_logger("scheduler", "scheduler.log")