Commit: 17a27170bc
Author: co63oc (committed by GitHub)
Date: 2025-09-15 18:33:30 +08:00
Parent: 113e330030
17 changed files with 28 additions and 28 deletions

View File

@@ -308,7 +308,7 @@ assert np.allclose(
), f"C16 prefix cache != No prefix cache,\n attn_out[hit_prefix_len:]: {attn_out_np},\nattn_out_prefix_cache: {attn_out_prefix_cache_np}"
print("\n-- C8 per channle prefix cache test --")
print("\n-- C8 per channel prefix cache test --")
print(
"attn_out_C8[hit_prefix_len:]'s mean:",
attn_out_C8[hit_prefix_len:].mean().item(),
@@ -318,9 +318,9 @@ attn_out_C8_prefix_cache_np = attn_out_C8_prefix_cache.astype("float32").numpy()
attn_out_C8_np = attn_out_C8[hit_prefix_len:].astype("float32").numpy()
assert np.allclose(
attn_out_C8_prefix_cache_np, attn_out_C8_np, rtol=1e-1, atol=1e-2
), f"C8 per channle prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}"
), f"C8 per channel prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}"
print("\n-- C8 per channle zp prefix cache test --")
print("\n-- C8 per channel zp prefix cache test --")
print(
"attn_out_C8_zp[hit_prefix_len:]'s mean:",
attn_out_C8_zp[hit_prefix_len:].mean().item(),
@@ -333,4 +333,4 @@ attn_out_C8_zp_prefix_cache_np = attn_out_C8_zp_prefix_cache.astype("float32").n
attn_out_C8_zp_np = attn_out_C8_zp[hit_prefix_len:].astype("float32").numpy()
assert np.allclose(
attn_out_C8_zp_prefix_cache_np, attn_out_C8_zp_np, rtol=1e-1, atol=1e-2
), f"C8 per channle zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}"
), f"C8 per channel zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}"

View File

@@ -80,7 +80,7 @@ combined_out_pd = ep_moe_expert_combine(
moe_index_pd.shape[1],
)
# comparation
# comparison
# print("moe_index:\n", moe_index)
# print("moe_weights:\n", moe_weights)
# print("combined_out_np:\n", combined_out_np)

View File

@@ -117,7 +117,7 @@ print(f"cumsum_idx:\n{cumsum_idx}")
"weight_only_int8",
)
# comparation
# comparison
permute_input_xpu = permute_input_xpu.astype("float32").numpy()
permute_indices_per_token_xpu = permute_indices_per_token_xpu.numpy()
recv_num_tokens_per_expert_list_cumsum_xpu = recv_num_tokens_per_expert_list_cumsum_xpu.numpy()

View File

@@ -130,7 +130,7 @@ out_pd = weight_only_linear(x_pd, qw_pd, wscale_pd, None, weight_dtype, -1, -1)
print(f"out_pd:\n{out_pd}")
print(f"out_np:\n{out_np}")
# comparation
# comparison
print(f"out_pd, mean={out_pd.mean()}, std={out_pd.std()}")
print(f"out_np, mean={out_np.mean()}, std={out_np.std()}")
sum_diff = np.sum(np.abs(out_pd.astype("float32").numpy() - out_np.astype("float32")))

View File

@@ -32,7 +32,7 @@ During sparse attention computation, each query token may dynamically select dif
To optimize performance in both the prefill and decode stages, we design a special joint strategy to adapt to their respective characteristics:
* **Prefill Toke Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens.
* **Prefill Token Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens.
* **Decode Head Union**: Given the widespread adoption of GQA in modern models, we find that different heads within the same group often select overlapping key blocks. Thus, we combine the key blocks selected by all query heads within a group into a unified set and jointly calculate sparse attention. This way also reduces memory access overhead and further improves decoding efficiency.
* **Top-K Selection**: Conventional top-k algorithms based on sorting or direct calls to the cub library introduce significant runtime overhead. To mitigate this, we implemented an approximate top-k selection algorithm using binary search, which significantly reduces latency while maintaining accuracy, ultimately achieving significantly improved performance.
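The binary-search idea behind that top-k step can be illustrated with a small sketch. This is only a CPU-side approximation of the concept (assumptions: plain NumPy, one score vector per query block, a fixed iteration count, and an illustrative name `approx_topk_mask`); the actual kernel operates on GPU blocks. Rather than sorting all block scores, it binary-searches a threshold until roughly k blocks exceed it:

```python
import numpy as np

def approx_topk_mask(scores: np.ndarray, k: int, iters: int = 16) -> np.ndarray:
    """Approximate top-k selection by binary-searching a score threshold.

    Instead of sorting, search for a threshold t such that roughly k entries
    satisfy scores >= t. Illustrative only; the real kernel works on GPU
    blocks of key scores per query group.
    """
    lo, hi = float(scores.min()), float(scores.max())
    for _ in range(iters):
        mid = (lo + hi) / 2.0
        count = int((scores >= mid).sum())
        if count > k:      # too many blocks selected -> raise the threshold
            lo = mid
        elif count < k:    # too few blocks selected -> lower the threshold
            hi = mid
        else:
            break
    return scores >= (lo + hi) / 2.0

# Example: keep roughly 16 of 256 candidate key blocks by score.
scores = np.random.rand(256).astype("float32")
mask = approx_topk_mask(scores, k=16)
print("selected blocks:", int(mask.sum()))
```

Because each iteration only needs one reduction over the scores, the loop avoids a full sort while typically selecting close to k blocks, which is why the selection is described as approximate.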
@@ -132,7 +132,7 @@ We selected a subset (longbook_sum_eng) from InfiniteBench as the performance ev
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>QPS</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Decode Speed (token/s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time to First token(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Ouput Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Output Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>End-to-End Latency(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Input<br>Length</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Output Length</strong></td>

View File

@@ -33,7 +33,7 @@ docker exec -it paddle_infer bash
pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
```
For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
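After installing the wheels above, a quick sanity check confirms that PaddlePaddle imports and can run a small program on the detected device. This is a minimal sketch using PaddlePaddle's standard `paddle.utils.run_check()` self-test; output details on the iluvatar (ixuca) backend may differ from a stock build.

```python
# Minimal post-install check (standard PaddlePaddle utility; assumes the
# wheels above installed cleanly -- ixuca-specific messages may vary).
import paddle

print(paddle.__version__)   # expect 3.1.1 to match the pip commands above
paddle.utils.run_check()    # compiles and runs a small program on the detected device
```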
### Install or build FastDeploy
```bash

View File

@@ -34,7 +34,7 @@
To optimize performance in both the prefill and decode stages, we design a special joint strategy adapted to their respective characteristics:
* **Prefill Toke Union**: We observe that adjacent query tokens tend to select similar key blocks. Exploiting this locality, we take the union of the key blocks selected by 128 consecutive query tokens and jointly compute sparse attention for these tokens.
* **Prefill Token Union**: We observe that adjacent query tokens tend to select similar key blocks. Exploiting this locality, we take the union of the key blocks selected by 128 consecutive query tokens and jointly compute sparse attention for these tokens.
* **Decode Head Union**: Given the widespread adoption of GQA in modern models, we find that different query heads within the same group often select overlapping key blocks. We therefore merge the key blocks selected by all query heads in a group into one unified set and jointly compute sparse attention over it. This also reduces memory-access overhead and further improves decoding efficiency.
@@ -136,7 +136,7 @@
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>QPS</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Decode Speed (token/s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time to First token(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Ouput Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Time per Output Token(ms)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>End-to-End Latency(s)</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Input<br>Length</strong></td>
<td style="border: 1px solid #dcdde0; padding: 8px; text-align: center; vertical-align: middle;"><strong>Mean Output Length</strong></td>

View File

@@ -455,7 +455,7 @@ class ResourceManagerV1(ResourceManager):
# schedule when extend block tables is needed
for req in self.running:
num_prefill_blocks = req.need_prefill_tokens // self.config.cache_config.block_size
# alocate
# allocate
if req.use_extend_tables and req.request_id not in self.using_extend_tables_req_id:
llm_logger.info(
f"req {req.request_id} at batch id {req.idx} with num_prefill_blocks {num_prefill_blocks} is going to enable extend tables"
@@ -488,7 +488,7 @@ class ResourceManagerV1(ResourceManager):
<= self.config.cache_config.prealloc_dec_block_slot_num_threshold
):
llm_logger.info(
f"req {req.request_id} is going to alocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}"
f"req {req.request_id} is going to allocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}"
)
if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
req.extend_block_tables.extend(

View File

@@ -16,7 +16,7 @@
from fastdeploy.plugins.utils import load_plugins_by_group
# use for modle runner
# use for model runner
PLUGINS_GROUP = "fastdeploy.model_runner_plugins"

View File

@@ -576,7 +576,7 @@ def parse_args():
"--moba_attention_config",
type=json.loads,
default=None,
help="Configation of moba attention.",
help="Configuration of moba attention.",
)
parser.add_argument(
"--guided_decoding_backend",

View File

@@ -181,7 +181,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
chunk_dict = json.loads(json_part)
parsed_chunks.append(chunk_dict)
except json.JSONDecodeError as e:
self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}")
self.fail(f"Cannot parser {i+1} chunk, JSON: {e}\n origin string: {repr(chunk_str)}")
else:
self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}")
for chunk_dict in parsed_chunks:
@@ -260,7 +260,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
chunk_dict = json.loads(json_part)
parsed_chunks.append(chunk_dict)
except json.JSONDecodeError as e:
self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}")
self.fail(f"Cannot parser {i+1} chunk, JSON: {e}\n origin string: {repr(chunk_str)}")
else:
self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}")
self.assertEqual(len(parsed_chunks), 1)

View File

@@ -89,7 +89,7 @@ class TestCase1SubLayer3(paddle.nn.Layer):
class TestModel1(paddle.nn.Layer):
"""Tast Model"""
"""Test Model"""
def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()

View File

@@ -36,7 +36,7 @@ class TestCase1SubLayer1(paddle.nn.Layer):
class TestModel1(paddle.nn.Layer):
"""Tast Model"""
"""Test Model"""
def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()
@@ -123,9 +123,9 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
assert (output1 == self.output_correct).all()
# Destroy
print_gpu_memory_use(0, "before destory")
print_gpu_memory_use(0, "before destroy")
self.test_model1.clear_grpah_opt_backend()
print_gpu_memory_use(0, "after destory")
print_gpu_memory_use(0, "after destroy")
def recapture_and_replay(self, input_tensor1, forward_meta1):
""" """
@@ -139,9 +139,9 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
assert (output2 == self.output_correct).all()
# Destroy
print_gpu_memory_use(0, "before destory")
print_gpu_memory_use(0, "before destroy")
self.test_model1.clear_grpah_opt_backend()
print_gpu_memory_use(0, "after destory")
print_gpu_memory_use(0, "after destroy")
if __name__ == "__main__":

View File

@@ -51,7 +51,7 @@ class TestCase1SubLayer1(paddle.nn.Layer):
class TestModel1(paddle.nn.Layer):
"""Tast Model"""
"""Test Model"""
def __init__(self, fd_config: FDConfig, **kwargs):
super().__init__()

View File

@@ -123,6 +123,6 @@ def test_model_cache(
check_tokens_id_and_text_close(
outputs_0_lst=fd_outputs_v1,
outputs_1_lst=fd_outputs_v1_with_cache,
name_0="default_v1 laoder",
name_0="default_v1 loader",
name_1="default_v1 loader using cache",
)

View File

@@ -100,7 +100,7 @@ def form_model_get_output_topp0(
fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
result_queue.put(fd_outputs)
except Exception:
print(f"Failed using {load_choices} laoder to load model from {model_path}.")
print(f"Failed using {load_choices} loader to load model from {model_path}.")
traceback.print_exc()
pytest.fail(f"Failed to initialize LLM model from {model_path}")

View File

@@ -28,7 +28,7 @@ FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
MAX_WAIT_SECONDS = 60
os.environ["LD_LIBRARY_PATH"] = "/usr/local/nccl/"
# enbale get_save_output_v1
# enable get_save_output_v1
os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"