Compare commits

582 Commits

Author SHA1 Message Date
RichardWooSJTU
791b101195 revert worker process ipc signal suffix (#4323)
Some checks failed
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-30 03:56:41 -07:00
fangfangssj
af3872215e [BugFix]remove redundant includes (#4312)
Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-09-30 17:54:19 +08:00
Zero Rains
d14aadf70e [FIx] CI Approve fix (#4316)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* check approve

* fix the bug

* update
2025-09-29 11:38:24 -07:00
chen
81959c7d88 [NewFeature]custom_allreduce support cudagraph recapture (#4305)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* custom_allreduce support cudagraph recapture

* add shut_down/restart default group
2025-09-29 15:56:54 +08:00
xiaozude
7c919070f7 [Metax] support cutlass moe & optimize flash attention (#4208)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-29 11:22:43 +08:00
K11OntheBoat
2b2b645296 Fix bugs of splitwise_complete_prefilled_step IPCsignal clear (#4309)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
2025-09-29 11:21:22 +08:00
RichardWooSJTU
3740e33fea 【Feature】ResourceManagerV1 support need block num notifying (#4220)
* support need block num notifying

* adapt t2i

* fix unexpected change
2025-09-29 11:11:51 +08:00
李泳桦
70633c6641 [fix] fix gpu_caches key (#4311)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-28 21:32:57 +08:00
xiaolei373
1282ebe1b1 add_cli_tokenizer (#4278)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-28 20:47:35 +08:00
李泳桦
6265f4385f [feat] support prefix cache clearing when /clear_load_weight is called (#4008)
* [feat] support clearing prefix cache (cherry-picked from release/2.1)

* [fix] fix ipc suffix, use port instead

* [fix] fix prefix caching not enabled

* [fix] fix key/value_cache_scales indent

* [fix] fix ep group all-reduce

* [fix] fix clear/update lock not working when workers > 1

* [chore] add preemption triggered info log

* [fix] fix code style

* [fix] fix max_num_seqs config

* [fix] do not force enable_prefix_caching=False in dynamic loading

* [fix] fix ci

* Revert "[fix] fix ci"

This reverts commit 0bc6d55cc8.

* [fix] initialize available_gpu_block_num with max_gpu_block_num

* [fix] fix config splitwise_role

* [fix] fix clearing caches synchronization and add more logs

* [chore] print cache_ready_signal in log

* [fix] fix scheduler_config.splitwise_role

* [fix] fix cache_messager cache_ready_signal create=True

* [fix] stop cache messager from launching in mixed deployment
2025-09-28 19:42:53 +08:00
Lucas
59313ed7f9 [XPU] fix VL thinking mode (#4266) 2025-09-28 17:37:37 +08:00
Sunny-bot1
aa1cc09c5b fix machete pre quant (#4295) 2025-09-28 16:11:09 +08:00
K11OntheBoat
7b6cb72ab2 Fix wrong batch size of thinking_mask (#4296)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
Co-authored-by: xiegegege <46314656+xiegegege@users.noreply.github.com>
2025-09-28 14:56:42 +08:00
chenjian
3cef851468 [Bug fix] Fix bug for running ep (#4245)
* fix bug for ep

* fix bug
2025-09-28 14:56:18 +08:00
luukunn
17e00d9f5d fix reasoning_max_tokens (#4277) 2025-09-28 14:05:29 +08:00
Zhenghai Zhang
aa045aa84f fix typos (#4274)
Some checks failed
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-27 09:25:43 +08:00
GoldPancake
79c2c52756 deepgemm pre-compile tool support mixed parallel (#4282)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-26 18:43:39 +08:00
YUNSHEN XIE
5c6e859681 increase ccache size (#4255) 2025-09-26 17:40:07 +08:00
yyssys
f40d7c6d65 [Docs]When XPU starts the service, the model loader uses the default version (#4292) 2025-09-26 15:58:12 +08:00
Zero Rains
331c4d2a74 Set approve checking for config.py, worker, model and cudagraph (#4276)
* set approve checking for config.py and worker files

* update

* update

* update file list

* check worker

* update

* check graph

* check model_loader

* check models

* update
2025-09-26 14:50:54 +08:00
GoldPancake
838de53de8 Add speculative decoding approval check (#4284) 2025-09-26 14:47:45 +08:00
xiaolei373
55124f8491 Add cli run batch (#4237)
* feat(log):add_request_and_response_log

* [cli] add run batch cli

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-26 14:27:25 +08:00
tianlef
8a964329f4 add glm benchmark yaml (#4289) 2025-09-26 14:23:29 +08:00
Zhong Hui
67e693b18b fix ernie vl distributed attr. (#4215)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-26 14:18:49 +08:00
zhuzixuan
12a3587cca [Supplements and upgrades]Improvement of X1 parsers (#4172)
* reasoning_parser

* reasoning_parser

* reasoning_parser

* reasoning_parser

* reasoning_parser

* reasoning_parser

* reasoning_parser
2025-09-26 13:37:37 +08:00
YuBaoku
dd2e844ea3 [CI] fix base_test error temporarily (#4283)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-26 11:24:55 +08:00
memoryCoderC
4ec00df2b0 [Feature] add config api (#4254) 2025-09-26 11:21:02 +08:00
kxz2002
83d41d23b0 initial commit (#4248)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-25 21:42:05 +08:00
yyssys
c415885a94 [Docs]Add ENABLE_V1_KVCACHE_SCHEDULER=0 to docs (#4268)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-25 20:09:03 +08:00
K11OntheBoat
4515ad21e9 Support limit thinking lengths (#4069)
Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
2025-09-25 19:55:56 +08:00
Yuanle Liu
0c6f1932c5 delete_moe_phase_in_parallel_config (#4264) 2025-09-25 17:14:37 +08:00
Lucas
87179cb744 [XPU] support XPU VL model inference (#4030)
* [XPU] support XPU VL model inference

* fix image op import and device check

* rebase develop

* fix perf
2025-09-25 14:34:15 +08:00
ooo oo
e36eccfdad 【Hackathon 9th No.21、23】add unit tests for fused_hadamard_quant_fp8, moe_fused_hadamard_quant_fp8 (#4094)
* test: add unit tests for fused_hadamard_quant_fp8

* test: add unit tests for moe_fused_hadamard_quant_fp8

* tests: simulate CUDA kernel's hadamard32_warp using butterfly operations

* apply review

* apply review
2025-09-25 12:15:00 +08:00
Zero Rains
b433a93d9a fix the bug for prefilled_step_idx signal of cache_messager in cudagraph and PD (#4235)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-24 19:46:52 +08:00
RAM
870364b547 [CUDAGraph]CUDA Graph support unique memory pool (#4230)
* cuda graph use unique memory pool

* fix custom device import bug

* refine code

* refine code

* refine code
2025-09-24 19:45:22 +08:00
CSWYF3634076
5ff10c8ced [Model] Qwen2.5VL support --use-cudagraph and unit testing (#4087)
* [BugFix] qwen2.5vl enable_thinking=true and image_patch_id bug fix

* [Docs]offine infer add apply_chat_template add_generation_prompt parameter

* [Model]qwen2.5VL support --use-cudagraph

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v2

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v3

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v4

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v5

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v6

* [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v7
2025-09-24 19:45:01 +08:00
luukunn
18f4977aec [fix]update apply_chat_template (#4137)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* update apply_chat_template

* fix unittest

* fix unittest

* fix

* fix

* fix unit test

* fix

* fix unit test

* add unit test
2025-09-24 18:56:32 +08:00
chen
7c1fd19f0f [OPs] MoE support wfp8afp8(channelwise) and improve per_token_quant_fp8 (#4238) 2025-09-24 16:39:51 +08:00
memoryCoderC
8b0ce8e3ab [Feature] add cli command serve (#4226) 2025-09-24 14:50:45 +08:00
ApplEOFDiscord
9566ae8827 [Bug Fix] disable prefix caching in mm model (#4167)
* add http get retry

* fix coments

* disable prefix caching in mm model

* fix unit test

---------

Co-authored-by: zhangjunjun04 <zhangjunjun04@baidu.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-24 14:43:46 +08:00
lizexu123
e8318b7477 [BugFix] fix qwen3-embedding model tp>1 (#4223)
* support qwen3-embedding

* fix ci bug

* fix

* fix ci bug

* fix ci bug

* fix

* fix qwen3-embedding

* fix

* fix

* fix
2025-09-24 14:13:26 +08:00
chen
3161014e49 [BugFix]fix v1 loader moe bf16, and supoort dynamic_load_weight create quant param (#4229)
* fix v1 loader moe bf16, and supoort dynamic_load_weight create quant param

* include_stop_str_in_output=False not return eos text
2025-09-24 14:12:05 +08:00
Yohanna
44010cee13 FIX] Fix CUDA error(700): 'cudaErrorIllegalAddress' in CascadeAppendWriteCacheKVQKV cache_kernel(). Continue when batch_id_per_token[token_idx] is default value -1. (#4218) 2025-09-24 14:08:49 +08:00
fmiao2372
f1b5392e20 [Intel HPU] Support intel hpu platform (#4161)
* [Intel HPU] Support intel hpu platform

* fix some issues

* apply precommit and move AttentionBackend_HPU

* fix format issue

* correct ops import

* fix ci issue

* update code in layers

* fix code style issue

* remove dense tp moe ep mode

* fix enc_dec_block_num

* fix rebase issue

* rename hpu to gaudi in readme

* rename ForwardMeta_HPU to HPUForwardMeta
2025-09-24 12:27:50 +08:00
co63oc
a1c5d930bb 【Hackathon 9th No.24】add rebuild_padding (#4107) 2025-09-24 12:08:17 +08:00
Yuanle Liu
b455fd39f3 register_model_class compatible with plugins (#4236)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-24 11:17:12 +08:00
yyssys
d6e59447f5 [XPU] Enable XPU V1 mode based on environment variable (#4213)
* Enable XPU V1 mode based on environment variable
* add default param to xft_moe_fc_block_eb for latest xvllm compatibility; update run_ci_xpu to use latest xvllm
2025-09-24 10:29:48 +08:00
chen
ec99474e71 [Test]add glm45_air logprob test and rollout model (#4175)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* add glm45_air logprob test

* add glm rollout model and pretrainedmodel for rl

* add glm rollout model and test

* check

* delete cudagraph in glm45

* add UT for glm rollout model

* revert glm UT
2025-09-23 21:06:07 +08:00
bukejiyu
62d1c48363 [v1 loader]code style (#4204)
* code style

* update
2025-09-23 19:36:00 +08:00
chen
1a6283424e Fix noaux_tc cuda Error 700 in CUDAGraph (#4174)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-23 18:41:33 +08:00
lizexu123
c96a535a5d [Feature] support qwen3-embedding model load (#4202)
* support qwen3-embedding

* fix ci bug

* fix

* fix ci bug

* fix ci bug

* fix
2025-09-23 00:14:35 -07:00
zhupengyang
9082f625ba [xpu] use cpu barrier (#4181) 2025-09-23 12:19:03 +08:00
plusNew001
813befadfa Update run_ci_xpu.sh to lock xvllm version (#4210)
Temporarily lock xvllm version due to compilation errors and update XVLLM_PATH.
2025-09-23 11:20:08 +08:00
plusNew001
c32aae901f [XPU] update XPU CI (#4209)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* change xpu ci model

* change xpu ci model

* change xpu ci model

* change xpu ci model

* Update model path and XPU settings in run_ci_xpu.sh

* Increase health check timeout to 10 minutes

Increased the timeout duration for health checks from 5 minutes to 10 minutes in two places.

* Implement test for OpenAI chat completion

Add a test function for the OpenAI client chat response.

* Change script to use pytest for running tests

* Update health check timeout to 15 minutes

Increase the timeout for health checks from 10 minutes to 15 minutes.

* Add pytest installation to CI script

* Modify base response in test_45t function

Updated the base response message for the test.

* Add V0 and V1 mode test echo statements

* Set ENABLE_V1_KVCACHE_SCHEDULER to 0

Disable V1 KVCACHE SCHEDULER for V0 mode testing.

---------

Co-authored-by: root <root@yq01-inf-hic-k8s-a100-aa24-0591.yq01.baidu.com>
2025-09-23 10:28:49 +08:00
yangjianfengo1
4325b737e7 【FIX】Change the name of sparse attn from moba to plas (#4006) (#4076)
* 【FIX】Change the name of sparse attn from moba to plas (#4006)

* 更新文档

* 【docs】 update readme (#4000)

* 更新文档

* update readme

* update docs

* 【FIX】Change the name of sparse attn from moba to plas (#3845)

* 更新文档

* 更新文档

* 更新文档

* 更新文档

* 修改moba为plas

* code style

* update ci

* code style

* update ci

* code style

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

* fix max_num_seqs

* fix test load attn

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-23 10:26:40 +08:00
plusNew001
2c34a557f4 [XPU]change xpu ci model (#4117)
* change xpu ci model

* change xpu ci model

* change xpu ci model

* change xpu ci model

* Update model path and XPU settings in run_ci_xpu.sh

* Increase health check timeout to 10 minutes

Increased the timeout duration for health checks from 5 minutes to 10 minutes in two places.

* Implement test for OpenAI chat completion

Add a test function for the OpenAI client chat response.

* Change script to use pytest for running tests

* Update health check timeout to 15 minutes

Increase the timeout for health checks from 10 minutes to 15 minutes.

* Add pytest installation to CI script

* Modify base response in test_45t function

Updated the base response message for the test.

* Add V0 and V1 mode test echo statements

---------

Co-authored-by: root <root@yq01-inf-hic-k8s-a100-aa24-0591.yq01.baidu.com>
2025-09-23 10:21:17 +08:00
ltd0924
83720da79f [Feature] support clear data (#3601)
* [Feature] support clear data

* update

* fix

* fix

* fix

* fix

* fix

* fix

* fix
2025-09-23 10:20:02 +08:00
Jiang-Jia-Jun
772f0156f3 Remove useless code (#4195)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-22 21:18:19 +08:00
yzwu
504461b6b5 [Iluvatar GPU] Optimize attention performance and fix moe load ckpt error (#3651) 2025-09-22 21:13:59 +08:00
Zhang Yulong
5532e8a323 [FD CLI] Add bench cli (#4160)
* add bench cli

* Update test_main.py
2025-09-22 20:37:30 +08:00
Echo-Nie
5e1f13bd3b add test_set_value_by_flags_and_idx.py (#4186) 2025-09-22 20:21:34 +08:00
co63oc
c5671d7c09 [MTP][Unit Test]add test_top_p_candidates (#4046)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* add test_top_p_candidates

* fix

* fix

* fix
2025-09-22 17:06:38 +08:00
chenjian
918ccdb123 [Feature] Support pd ep deployment with yiyan adapter (#4029)
* [Feature] Support mixed deployment with yiyan adapter in release2.2

* fix metrics

* add unit test

* add unit test

* add unit test

* Support pd ep deployment with yiyan adapter

* Support pd ep deployment with yiyan adapter

* refactor cache messager

* support scheduler v1 in PD

* suppport pd v1 + chunk prefill

* suppport pd v1 + chunk prefill

* add eplb

* support eplb

* support eplb

* support eplb

* support v1

* fix

* fix

* fix bug

* remove eplb support

* support prefix cache in P

* fix bug

* fix bug

* support one stop in V1

* fix bug

* fix ci

* fix ci

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-22 16:41:38 +08:00
Echo-Nie
9845f0d010 【Hackathon 9th No.30】add test_tritonmoe_preprocess (#3891)
* add test_tritonmoe_preprocess

* add value check

* del test_support_all...
2025-09-22 15:31:32 +08:00
co63oc
c4830ef24c fix typos (#4176)
* fix typos

* fix
2025-09-22 14:27:17 +08:00
Divano
0b62648924 test xly ci 2025-09-22 14:13:00 +08:00
lizexu123
c86945ef49 [Feature] support pool (#3827)
* support pool

* update pooling

* add pooler_config and check

* update

* support AutoWeightsLoader load weight

* fix

* update

* delete print

* update pre-commit

* fix

* fix xpu

* fix ModelRegistry->model_registry

* fix Copilot review

* fix pooler.py

* delete StepPooler

* fix abstract

* fix default_loader_v1

* fix Pre Commit

* support torch qwen3 dense

* add test and fix torch-qwen

* fix

* fix

* adapter ci:

* fix review

* fix pooling_params.py

* fix

* fix tasks.py 2025

* fix print and logger

* Modefy ModelRegistry and delete AutoWeightsLoader

* fix logger

* fix test_embedding

* fix ci bug

* ernie4_5 model_registry

* fix test

* support Qwen3-Embedding-0.6B tp=1 load

* fix extra code

* fix

* delete fix vocab_size

* delete prepare_params_dict

* fix:
2025-09-22 14:09:09 +08:00
chen
da74a5f0b3 fix glm all_reduce tp group (#4187) 2025-09-22 10:56:55 +08:00
co63oc
718f32a6b0 fix nul (#4191) 2025-09-22 10:55:33 +08:00
Lucas
5c33be5a7d [TEST] init first commit (#4192) 2025-09-22 10:51:27 +08:00
RichardWooSJTU
91912cc2e1 fix t2i (#4163)
Some checks failed
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
CE Compile Job / ce_job_pre_check (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-09-19 18:07:13 +08:00
Echo-Nie
cc6e14d2ec 【Hackathon 9th No.46】add test_fused_rotary_position_encoding (#3848)
* add test_fused_rotary_position_encoding

* 添加版权

* fix according to the review
2025-09-19 17:50:19 +08:00
YuanRisheng
24180fba0a [FDConfig]Remove splitwise_role and engine_worker_queue_port in FDConfig (#4147)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* remove splitwise_role and engine_worker_queue_port

* fix xpu

* fix xpu

* fix xpu

* fix unittest

* resolve conflct
2025-09-19 17:01:52 +08:00
luukunn
ee9d8a840a [fix]Modify follow-up push parameters and Modify the verification method for thinking length (#4086)
* 续推参数  generated_token_ids 修改成 completion_token_ids;修改思考长度校验方式

* 续推参数  generated_token_ids 修改成 completion_token_ids;修改思考长度校验方式

* 续推参数  generated_token_ids 修改成 completion_token_ids;修改思考长度校验方式

* 续推参数  generated_token_ids 修改成 completion_token_ids;修改思考长度校验方式

* add completion_token_ids

* add logger

* fix reasoning_max_tokens ParameterError

* add unittest

* add unittest

* add unittest

* add unittest

* add unittest

* add unit test
2025-09-19 14:26:01 +08:00
chen
66a98b44ed ep support logprob (#4089) (#4151) 2025-09-19 14:07:31 +08:00
Yuanle Liu
a685e5ad35 Each module should have its own plugins_loaded (#4164) 2025-09-19 14:06:10 +08:00
xiaolei373
ddf5606263 Bugfix test exception (#4171)
* feat(log):add_request_and_response_log

* modify default error type
2025-09-19 11:48:49 +08:00
Sunny-bot1
c3b8ebeb18 [Optimize] Machete using group scale default (#4121)
Some checks failed
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-18 13:51:11 +08:00
qwes5s5
62b8b02e08 fix_unitest (#4159)
Co-authored-by: K11OntheBoat <your_email@example.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-18 11:17:15 +08:00
xiaolei373
98447beb4d Add param valid log (#4113)
* feat(log):add_request_and_response_log

* [bugfix] add param valid log

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-18 10:39:24 +08:00
chenjian
618ccdbfba [Feature] Support mixed deployment with yiyan adapter in develop (#3976)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [Feature] Support mixed deployment with yiyan adapter in release2.2

* fix metrics

* add unit test

* add unit test

* add unit test

* fix ci

* fix for eb5

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-18 01:52:20 +08:00
YuBaoku
2745f37017 [CI] enhance clean port and add waiting time (#4152)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-17 20:31:49 +08:00
gaoziyuan
896e3bb606 [NewFeture]add ep rollout model init and update/clear ep buffer (#4039)
* fix gid

* merge

* fix test

* fix bug

* fix

* fix ci
2025-09-17 20:24:53 +08:00
YuanRisheng
0d3a57a2c6 fix unittest (#4155) 2025-09-17 20:20:26 +08:00
qw86972190
b52971749c Print KV Cache available memory and block memory usage in GB format (#4148) 2025-09-17 20:01:55 +08:00
RichardWooSJTU
2adca04f1f Reconstruct streaming data transfer with zmq (#3836)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* reconstruct USE_GET_SAVE_OUTPUT_V1

* fix ut

* use dp rank

* fix ci
2025-09-17 14:30:39 +08:00
Jiang-Jia-Jun
f9766f917b [BugFix] Forbiden FD_DISABLED_RECOVER while ENABLE_V1_KVCACHE_SCHEDULER (#4142)
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-09-17 14:11:44 +08:00
YuanRisheng
2e9e53ff7e [FDConfig]Remove max_num_batched_tokens/max_num_seqs in parallel config (#4116)
* remove max_num_batched_tokens in parallel config

* remove max_num_seqs

* update test case

* fix test

* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-17 10:43:35 +08:00
YUNSHEN XIE
c01a756912 mv test to tests (#4129)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-16 20:45:40 +08:00
Zhang Yulong
cd09913552 Update test_w4a8_model.py (#4125) 2025-09-16 20:43:10 +08:00
chenjian
67e6d8c691 [Feature] Set prefix caching as default (#3814)
* Set prefix caching as default

* Set prefix caching as default

* Set prefix caching as default

* skip dynamic load scene

* fix kill bug

* fix kill bug

* fix kill bug

* fix

* fix

* fix ci
2025-09-16 20:34:27 +08:00
Yuan Xiaolan
de8638b1e9 fix dynamic Cfp8 computing error (#4119)
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-16 20:21:49 +08:00
YUNSHEN XIE
4f8901489c ci: Increase compilation task time limit (#4098)
* ci: Increase compilation task time limit

* update

* update

* rename

* update

* update
2025-09-16 20:05:45 +08:00
tianlef
e79a1a7938 x1_a3b config (#4135) 2025-09-16 19:44:46 +08:00
xiegegege
d682c97dd3 [benchmark]add lite-vl and x1 yaml (#4130) 2025-09-16 16:38:36 +08:00
Divano
8e49d99009 Addcase (#4112)
logprob 没跑,不影响,增加校验openai 异常情况下 错误输出格式字段的case
2025-09-16 16:12:14 +08:00
tianlef
83bf1fd5aa [Doc]add plas attention config (#4128) 2025-09-16 15:55:12 +08:00
co63oc
b70ca35c0b 【Hackathon 9th No.52】add test_dynamic_per_token_scaled_fp8_quant (#4015)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* add test_dynamic_per_token_scaled_fp8_quant

* fix

* add bfloat16

* ci
2025-09-16 14:11:29 +08:00
Echo-Nie
befe463f01 【Hackathon 9th No.37】add test_top_k_renorm_probs (#3755)
* add test_top_k_renorm_probs.py

* add size=2,3
2025-09-16 11:12:46 +08:00
Sunny-bot1
442543cd6b fix ep wint8 (#4102) 2025-09-16 11:05:33 +08:00
Yuanle Liu
ed2dcec829 add ignore=all for deepgemm (#4118)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-15 21:52:00 +08:00
Jiang-Jia-Jun
a04365a0c7 Update api_server.py 2025-09-15 21:31:33 +08:00
YuanRisheng
03b3d6175d fix mtp (#4105) 2025-09-15 20:26:07 +08:00
co63oc
17a27170bc fix typos (#4093) 2025-09-15 18:33:30 +08:00
bukejiyu
113e330030 fix bf16 and add comments (#4106) 2025-09-15 17:23:07 +08:00
freeliuzc
69aa2781a1 [MTP]Support mtp reshard (#4099)
* support rl reshard

* modify model name
2025-09-15 17:13:53 +08:00
freeliuzc
46911f903d [MTP]update hybrid-mtp-with-ngram (#4047) 2025-09-15 17:13:31 +08:00
Yuanle Liu
b1b33211e8 [CUDAGraph] Support multi output buffers and merge some fixes from feature/exp_0908 (#4062)
* refine cudagraph

* refine cudagraph

* typo

* fix

* fix plugins

* fix

* update

* update

* update
2025-09-15 16:21:30 +08:00
zhupengyang
9409665713 [xpu] support ep (#4067)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-15 13:53:11 +08:00
bukejiyu
29ed617f0f [v1 loader]qwen Offline fp8 (#4036)
* support offline fp8

* update ut

* update ut

* update ut

* fix

* update

* update
2025-09-15 13:44:11 +08:00
Sunny-bot1
b1a5b756a3 [Optimize] Support WINT8 and group scale for Machete (#3905) 2025-09-15 12:01:34 +08:00
Echo-Nie
4408dc7f67 【Hackathon 9th No.49】add test_pre_cache_len_concat (#3847)
* add test_pre_cache_len_concat

* fix according review, add ref_pre_cache_len_concat
2025-09-15 11:20:14 +08:00
co63oc
ef4a1aa2da 【Hackathon 9th No.61、65】add test_draft_model_update (#3940)
* add draft_model_update test

* fix

* fix

* fix

* fix

* fix
2025-09-15 11:19:50 +08:00
Zero Rains
f213ae1e86 [Bug Fix]fix the bug for cache_messager signal loss (#3879)
* fix the bug for real size 0 in cudagraph

* fix cache_messager
2025-09-15 11:16:24 +08:00
qwes5s5
553adb299e 【FastDeploy CLI】collect-env subcommand (#4044)
* collect-env subcommand

* trigger ci

---------

Co-authored-by: K11OntheBoat <your_email@example.com>
2025-09-15 10:31:23 +08:00
zhouchong
958abebeab Support offline inference with streaming output (#4071)
* Support offline inference with streaming output

* add unit test

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-15 10:27:03 +08:00
YUNSHEN XIE
4871f18dad fix(CE): update concurrency to stop CE tasks from canceling each other (#4083)
Some checks failed
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-12 19:16:26 +08:00
Ayakouji
987609c894 [BugFix] Fix image_feature 0-Size causing insert failed (#4042)
* update

* fix image_feature
2025-09-12 19:13:08 +08:00
xiaolei373
9ac539471d [format] Valid para format error info (#4035)
* feat(log):add_request_and_response_log

* 报错信息与OpenAI对齐
2025-09-12 19:05:17 +08:00
YuanRisheng
88ea565aba [BugFix]Fix load kv cache quant scale (#4077)
* fix kv cache

* fix kv_cache

* fix kv cache
2025-09-12 17:44:03 +08:00
co63oc
c86b3357ce 【Hackathon 9th No.78】add test_chat.py (#3958) 2025-09-12 16:53:27 +08:00
Echo-Nie
06f4b49ca3 【Hackathon 9th No.25】add test_fused_get_rotary_embedding (#3892)
* add test_fused_get_rotary_embedding

* 增加基于 NumPy 的基准实现

* 添加,开源软件的版权和许可声明
2025-09-12 15:38:43 +08:00
SuperNova
805f29a06c [Feature] refactor metax_gpu attention and moe and remove some useless code (#3688)
Co-authored-by: yongqiangma <xing.wo@163.com>
2025-09-12 14:40:25 +08:00
ltd0924
cab7a633fe [CI] add multi api server test (#4049)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [BugFix] fix max streaming tokens invalid

* fix scheduler bug

* fix scheduler bug

* Update multi_api_server.py

* Create test_multi_api_server.py

* fix
2025-09-12 11:18:38 +08:00
qwes5s5
58e0785bab [metrics] update metrics markdown file (#4061)
* adjust md

* trigger ci

---------

Co-authored-by: K11OntheBoat <your_email@example.com>
2025-09-12 11:13:43 +08:00
co63oc
8466219ec8 fix typos (#3840)
* fix typos

* ci

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-12 11:04:38 +08:00
RichardWooSJTU
82dab8a91a Add token processor plugin support (#4059)
* Add token processor plugin support

* fix import

* fix import
2025-09-12 10:17:23 +08:00
chenjian
37f1632732 [Optimize] optimize prefix cache in develop (#3890)
* optimize prefix cache in release22

* fix

* fix

* fix

* add ci for v1

* add unit test

---------

Co-authored-by: xiegegege <46314656+xiegegege@users.noreply.github.com>
2025-09-12 10:15:59 +08:00
chen
4859f40b20 [Feature] GLM-45-AIR Support Mix Quantization(Dense wfp8afp8 and wint8 triton_moe_backend) (#4051)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-11 20:08:09 +08:00
lddfym
2056a428bd [bug fix] Fix the placeholder in qwen prompt and add some unittests (#4065)
* fix the placeholder in qwen prompt

* fix the placeholder in qwen prompt

* add soem unittests for qwen_vl_processor
2025-09-11 20:00:02 +08:00
memoryCoderC
850465e8ed [Feature] add cli command chat,complete (#4037) 2025-09-11 19:53:14 +08:00
zhuzixuan
a47976e82d [Echo] Support more types of prompt echo (#4022)
* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

* wenxin-tools-700 When the prompt type is list[int] or list[list[int]], it needs to support echoing after decoding.

---------

Co-authored-by: luukunn <83932082+luukunn@users.noreply.github.com>
2025-09-11 19:34:44 +08:00
xiaoxiaohehe001
abdcef30aa [BugFix] mm_post_fix (#4005)
* mm_post_fix

* mm_post_fix_1
2025-09-11 19:09:46 +08:00
Zhang Yulong
d2ec7f6aa2 update ci (#4064)
* update ci

* update ci
2025-09-11 18:36:25 +08:00
YuBaoku
fec58639db [CI] skip test_structured_outputs* temporarily (#4055) 2025-09-11 18:07:50 +08:00
YuanRisheng
d2d04c2d5e [setup optimize]Support git submodule (#4033)
* support git submodule

* update setup

* fix ci network

* fix clone

* revert clone linux

* delete args

* fix ci

* update
2025-09-11 17:41:16 +08:00
SuperNova
d60f7c4661 fix import tests.utils error in tests/model_loader/test_load_mtp.py (#4027)
Co-authored-by: yongqiangma <xing.wo@163.com>
2025-09-11 16:47:16 +08:00
CSWYF3634076
e4c64a71cc [BugFix] qwen2.5vl enable_thinking=true and image_patch_id bug fix (#3921) 2025-09-11 15:08:24 +08:00
bukejiyu
2650f58740 [docs] Update environment variables documentation (#3957)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-10 21:17:06 -07:00
co63oc
2af0f671b1 【Hackathon 9th No.55】add test_update_inputs_v1.py (#3992) 2025-09-11 11:34:22 +08:00
AIbin
a7392a0ff9 【Inference Optimize】DeepSeek-V3-model MLA Optimize (#3886)
* support MLA chunk_size auto search & cuda_graph
2025-09-11 10:46:09 +08:00
chen
637d96c6ae [Feature] Support zai-org/GLM-4.5-Air BF16 model (#3928)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* support glm45_air
2025-09-10 19:36:10 +08:00
freeliuzc
7ee100903f support rope_3d in spec mode (#4034) 2025-09-10 03:15:05 -07:00
ltd0924
684e93269b [Fix] fix multi api server log dir (#3967)
* [BugFix] fix max streaming tokens invalid

* fix scheduler bug

* fix scheduler bug

* Update multi_api_server.py
2025-09-10 17:15:30 +08:00
wanrui
276f73cf83 【Hackathon 9th No.28】add test_cutlass_fp8_fp8_fp8_dual_gemm_fused (#3935)
* add test_cutlass_fp8_fp8_fp8_dual_gemm_fused

* fix the version

* fix code style

---------

Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-09-10 14:57:49 +08:00
RAM
d3e4ae3d49 [Executor] Adjust signal sending order in RL training (#3773)
* Adjust processing order

* fix bug

* fix update_parameters bug

* refine code
2025-09-10 13:24:20 +08:00
Ayakouji
453487d5b0 [Feat] ernie4_5_vl_moe support CudaGraph (#3226)
* delete dynamic control flow for decode

* coda-style

* fix scatter/gather typos and use input stream instead default stream

* support 0-Size Tensor

* update runner and model

* using static mem address as input

* fix mem leak

* refine code

* update mm_buffer

* fix typo

* fix buffersize

* fix unk token

* refine code

* refine

* support other arch

* open cudagraph in vlci

* fix

* update

* update

* update

* fix cmd

* update

---------

Co-authored-by: aquagull <hongyuh@qq.com>
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-09-10 13:11:57 +08:00
zhupengyang
9d0074a91a [xpu] add ep custom ops (#3911)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-10 12:22:50 +08:00
Yuanle Liu
c3b2a60fb8 [BugFix] Fix the abnormal memory usage caused by shape errors in the triton moe backend (#4026)
* fix device_id to in

* fix triton_moe bug
2025-09-09 20:05:54 -07:00
周周周
dbab579299 clean code (#4020) 2025-09-10 10:56:15 +08:00
guozhuangzhuang
f078a959b6 metrics shared folder naming (#4007)
* Fixed the issue of metrics file conflicts between multiple instances on a single machine

* Use uuid to name the metrics shared folder

* Use uuid to name the metrics shared folder
2025-09-10 10:47:20 +08:00
Sunny-bot1
3b1da6e4dd support v1 loader for machete (#3999) 2025-09-10 10:21:33 +08:00
YuanRisheng
b3fac5bde1 [V1 Loader] Ernie kv cache quant support v1 loader (#3899)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* support c8 for ernie

* add unittest

* support vl

* fix c8
2025-09-09 05:25:08 -07:00
Zero Rains
98bfefea02 get org_vocab_size from args (#3983) 2025-09-09 15:08:03 +08:00
Jiang-Jia-Jun
c60adf4281 Revert "【FIX】Change the name of sparse attn from moba to plas (#3845)" (#4001)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
This reverts commit e31c8f7336.
2025-09-09 11:08:23 +08:00
Jiang-Jia-Jun
bbd548ceb6 Revert "【Fix】Change the name of sparse attn from moba to plas (#3993)" (#4002)
This reverts commit a553d1896c.
2025-09-09 11:07:46 +08:00
yangjianfengo1
f556561584 【docs】 update readme (#4000)
* 更新文档

* update readme

* update docs
2025-09-09 11:04:08 +08:00
yangjianfengo1
a553d1896c 【Fix】Change the name of sparse attn from moba to plas (#3993)
* 更新文档

* 更新文档

* 更新文档

* 更新文档

* 修改moba为plas

* code style

* update ci

* code style

* update ci
2025-09-09 10:57:07 +08:00
yangjianfengo1
e31c8f7336 【FIX】Change the name of sparse attn from moba to plas (#3845)
* 更新文档

* 更新文档

* 更新文档

* 更新文档

* 修改moba为plas

* code style

* update ci

* code style

* update ci
2025-09-09 10:56:50 +08:00
yangjianfengo1
de34222842 更新文档 (#3998) 2025-09-09 10:44:15 +08:00
JYChen
8e8a5913da add a3b-thinking doc (#3994) 2025-09-09 10:27:01 +08:00
Jiang-Jia-Jun
9f0e2a6854 Update README_CN.md 2025-09-09 10:11:25 +08:00
Jiang-Jia-Jun
30ddcc9115 Update README.md 2025-09-09 10:10:45 +08:00
Zhang Yulong
2359c8d21c update ci (#3962)
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-09 10:09:13 +08:00
Jiang-Jia-Jun
1dc1397ef6 Update docs for thinking model support 2025-09-09 10:08:05 +08:00
ming1753
12326b60e1 [Docs] update VL best_practices for release/2.2 (#3965)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [Docs] update VL best_practices for release/2.2

* fix bug

* modify
2025-09-08 22:07:37 +08:00
lzy
f12159b630 del batch id per token (#3963)
* Update decoder_write_cache_with_rope_kernel.cu

del batch_id_per_token

* Update decoder_write_cache_with_rope_impl.cuh

* Update test_append_attention.py

* Update test_append_attention.py
2025-09-08 21:58:34 +08:00
bukejiyu
08b3153661 update doc (#3990)
Co-authored-by: root <root@tjdm-inf-sci-k8s-hzz2-h12ni8-0214.tjdm.baidu.com>
2025-09-08 21:04:26 +08:00
AIbin
d00faeec69 update dsk doc (#3989) 2025-09-08 20:42:48 +08:00
yinwei
7e0bfd024f update release note (#3986)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-08 19:03:14 +08:00
JYChen
1f056a7469 [docs] update best practice docs (#3969)
* update best practice docs

* add version and v1 loader info
2025-09-08 17:39:38 +08:00
Echo-Nie
319a4bf75f 【Hackathon 9th No.36】add test_extract_text_token_output(#3862) 2025-09-08 17:31:58 +08:00
co63oc
f884cd4f62 [UnitTest][MTP]add test_speculate_set_stop_value_multi_seqs.py (#3941) 2025-09-08 17:11:00 +08:00
co63oc
f32327661c [UnitTest][MTP]add test_eagle_get_hidden_states (#3876) 2025-09-08 17:10:01 +08:00
co63oc
976aa88e66 【Hackathon 9th No.69】add test_draft_model_preprocess (#3832)
* add test_draft_model_preprocess

* fix

* ci
2025-09-08 17:08:50 +08:00
co63oc
ed462cf238 [UnitTest][MTP] add test_speculate_get_token_penalty_multi_scores.py (#3742)
* add test_speculate_get_token_penalty_multi_scores

* fix
2025-09-08 17:07:11 +08:00
Echo-Nie
20495f927e [UnitTest][MTP] supplementary unit test for ngram_match (#3732)
* supplement unittest for custom_ops: ngram_match

* add annotation

* 借助 step_idx 信息,改为在具体位置判断是否相等

* del anno

* del print

---------

Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-09-08 17:06:06 +08:00
ooo oo
0c46318b34 【Hackathon 9th No.22】add unit tests for share_external_data (#3744) 2025-09-08 17:05:48 +08:00
yangjianfengo1
9ead10e1bc 更新文档 (#3975) 2025-09-08 16:53:37 +08:00
xiaolei373
571ddc677b Modify markdown (#3896)
* feat(log):add_request_and_response_log

* modify markdown graceful shutdown
2025-09-08 16:42:34 +08:00
AIbin
316ac546d3 update_wint2_doc (#3968) 2025-09-08 15:53:09 +08:00
zhuzixuan
83bd55100b [Optimize]Error messages about Model api. (#3839)
* add v1/models interface related

* add model parameters

* default model verification

* unit test

* check model err_msg

* unit test

* type annotation

* model parameter in response

* modify document description

* modify document description

* unit test

* verification

* verification update

* model_name

* pre-commit

* update test case

* update test case

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/entrypoints/openai/serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* 优化报错信息。

---------

Co-authored-by: yangzichao01 <yangzichao01@baidu.com>
Co-authored-by: Yzc216 <101054010+Yzc216@users.noreply.github.com>
Co-authored-by: LiqinruiG <37392159+LiqinruiG@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-09-08 15:52:26 +08:00
co63oc
aadd6a94d8 fix typos (#3951) 2025-09-08 15:22:41 +08:00
co63oc
2033450391 rename ep_moe_prefill_func ep_moe_expert_dispatch (#3938) 2025-09-08 15:19:28 +08:00
Sunny-bot1
ed5133f704 update env docs for Machete (#3959) 2025-09-08 14:44:31 +08:00
qwes5s5
17169a14f2 [metrics] Add serveral observability metrics (#3868)
* Add several observability metrics

* [wenxin-tools-584] 【可观测性】支持查看本节点的并发数、剩余block_size、排队请求数等信息

* adjust some metrics and md files

* trigger ci

* adjust ci file

* trigger ci

* trigger ci

---------

Co-authored-by: K11OntheBoat <your_email@example.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-08 14:13:13 +08:00
Jundong Liu
3d0aaa5923 [Excutor] Experiment Feature-Support Prefill in cudagraph (#3459)
* Support prefill in Cudagraph

* Refactor GetBlockShapeAndSplitKVBlock Kernel V2

* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.1

* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.2

* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.3

* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.4

* Refactor GetBlockShapeAndSplitKVBlock Kernel V2.5

* Solve problem about encoder_num_blocks_x_cpu

* Add early-exit mechanism for attention kernel

* fix test case about append-attention

* Update testcode, Add annotations to related tensors

* move get_input_length_list

* solve test_code

* Add annotations about early-exit for attention kernel

* Add annotations about early-exit for attention kernel2

* solve comment

* solve mtp

---------

Co-authored-by: RAM <gstian5555@outlook.com>
2025-09-08 13:12:24 +08:00
yangjianfengo1
472402bf4e Update sparse attn documentation (#3954)
* 更新文档

* 更新文档

* 更新文档

* 更新文档
2025-09-08 12:23:18 +08:00
lzy
af49b81ffd supports dynamic Cfp8 (#3767)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* supports dynamic Cfp8

* add unittest
2025-09-07 20:41:29 -07:00
chenjian
b5e20e3015 [Bug fix] Fix prompt token ids dtype in v1 (#3860) 2025-09-08 11:34:13 +08:00
yinwei
7833f2f6cb [XPU]Fixed the issue of performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3897)
* fix bug

* fix bug

* update

* update

* update
2025-09-08 10:34:46 +08:00
ApplEOFDiscord
b649494655 [Feature] add HTTP GET retry (#3838)
* add http get retry

* fix coments

---------

Co-authored-by: zhangjunjun04 <zhangjunjun04@baidu.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-08 10:11:14 +08:00
bukejiyu
7c268693ed ignore ci (#3950)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-07 23:58:52 +08:00
bukejiyu
e52ce1c4b1 cache feature (#3857)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-07 18:52:46 +08:00
co63oc
30a1c1783f rename eagle_get_base_model_hidden_states.cu (#3753)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-07 10:24:58 +08:00
Zhang Yulong
349aa6348b add cache queue port (#3904)
Some checks failed
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* add cache queue port

* add cache queue port

* add cache queue port
2025-09-05 21:17:06 +08:00
ltd0924
0c45e225d3 mv connection_manager init (#3901)
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-09-05 21:11:48 +08:00
周周周
f6f726c773 clean code in sttantion (#3917) 2025-09-05 20:49:01 +08:00
chen
0d989829bb Compatible with EB 0.3B torch model arch (#3913)
* fix

* check
2025-09-05 19:04:59 +08:00
ltd0924
bd7d15f7ea [Feature] support controller port in multi api server (#3898)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* Update serving_chat.py

* Update serving_completion.py

* Update serving_completion.py

* Update multi_api_server.py
2025-09-05 17:16:31 +08:00
Yuan Xiaolan
2cf55168ca load hadamard_block_size from config (#3797) 2025-09-05 17:07:58 +08:00
AIbin
41aee08982 【Inference Optimize】Update MergedReplicatedLinear for DSK qkv_a_proj_with_mqa. (#3673)
* support MergedReplicatedLinear

* update MergedReplicatedLinear to support DSK_wint4 V1_load

* update model name

* update linear class

* fix

* fix v0 moe_bias load

---------

Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
2025-09-04 21:16:05 -07:00
ooo oo
b23fc654d9 【Hackathon 9th No.32】add unit tests for group_swiglu_with_masked (#3748) 2025-09-05 11:53:47 +08:00
gaoziyuan
ab1929f5ff fix mem boom in ep (#3854) 2025-09-05 11:48:21 +08:00
Echo-Nie
fc3bc56e59 【Hackathon 9th No.35】add test_moe_redundant_topk_select (#3867) 2025-09-05 11:29:02 +08:00
ltd0924
7643e6e6b2 [Docs] add data parallel (#3883)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [Docs] add data parallel

* [Docs] add data parallel
2025-09-04 20:33:50 +08:00
ltd0924
e0e7d68435 Update qwen_vl_processor.py (#3808) 2025-09-04 20:31:48 +08:00
Zhang Yulong
4c160aa4dd Update test_ernie_21b_mtp.py (#3885) 2025-09-04 20:20:36 +08:00
YuBaoku
c7b7126b20 [CI] update paddleformers==0.2 in develop (#3878) 2025-09-04 20:12:41 +08:00
SunLei
29628de6a7 Support for async processor added. (#3869)
* Support for async processor added.

* remove yappi code

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-09-04 19:58:53 +08:00
xiaolei373
ed97cf8396 Graceful shut down (#3785)
* feat(log):add_request_and_response_log

* 优雅退出-接口增加退出时长参数
2025-09-04 19:33:50 +08:00
freeliuzc
88d44a2c93 support mtp in v1_scheduler mode (#3695)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
2025-09-04 17:39:59 +08:00
xiaoxiaohehe001
f265a26f8b support mtp rope_3d (#3791)
* support mtp rope_3d

* Update speculate_write_cache_with_rope_kernel.cu
2025-09-04 17:18:05 +08:00
RichardWooSJTU
f36a388ffe fix response processsors (#3826)
* fix response processsors

* fix ci

* fix ut

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-04 16:01:25 +08:00
chenjian
22c165d6dd [Feature] Set v1 scheduler as default in develop (#3807)
* Set scheduler v1 as default

* Set scheduler v1 as default

* Set scheduler v1 as default

* Set scheduler v1 as default

* Set scheduler v1 as default

* close V1 in guided_decoding

* fix vl ci

* close V1 in guided_decoding
2025-09-04 15:16:56 +08:00
co63oc
e83251699f 【Hackathon 9th No.63】add test_draft_model_postprocess.py (#3757)
* add test_draft_model_postprocess.py

* fix

* fix
2025-09-04 15:00:48 +08:00
Echo-Nie
ac46ef403a 【Hackathon 9th No.34】add test_get_position_ids_and_mask_encoder_batch (#3739) 2025-09-04 14:54:30 +08:00
RichardWooSJTU
0989788b29 support extend block tables (#3824) 2025-09-04 14:39:04 +08:00
gaoziyuan
6ef3b611b0 add dp config (#3822) 2025-09-04 11:46:48 +08:00
ooo oo
460809070c 【Hackathon 9th No.54、57】 add unit tests for per_token_quant and per_token_quant_padding (#3746) 2025-09-04 11:46:38 +08:00
co63oc
7baf1b56e0 【Hackathon 9th No.27】add test_get_padding_offset (#3708)
* add test_get_padding_offset

* fix

* fix

* fix
2025-09-04 11:42:35 +08:00
co63oc
9ec4fa0f8e fix typo EngineSevice EngineService (#3841) 2025-09-04 11:20:36 +08:00
yangjianfengo1
c870be6d27 fix port (#3863) 2025-09-04 10:01:38 +08:00
plusNew001
3790505319 [XPU] Update XPU stable xvllm and xtdk version for 2.2 (#3853)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* Add debug environment variable exports

Added debug environment variable exports for CLANG_PATH and XVLLM_PATH.

* Lock paddlepaddle-xpu version in CI script

Temporarily lock paddlepaddle-xpu version due to framework update issues.

* Update no_proxy environment variable in CI workflow

* Install lsof tool in run_ci_xpu.sh

* Update dependency versions for stable release

* Update paddlepaddle-xpu installation command
2025-09-03 23:21:00 +08:00
co63oc
e24b745d48 [UnitTest][MTP]add test_speculate_get_output_padding_offset (#3740) 2025-09-03 22:21:21 +08:00
co63oc
aaa2de1afa [UnitTest][MTP]add test_speculate_get_padding_offset (#3730) 2025-09-03 22:21:02 +08:00
yyssys
abde903813 Automatically configure workers based on max-num-seqs (#3846)
Automatically configure workers based on max-num-seqs
2025-09-03 21:12:42 +08:00
YUNSHEN XIE
7dbd9412b0 reopen ut (#3795)
* reopen ut

* update

* update

* update ci dockerfile
2025-09-03 19:05:20 +08:00
luukunn
fc598d4c5a add reasoning parser plugin (#3811)
* add reasoning parser plugin

* fix finish reason
2025-09-03 18:31:27 +08:00
Ayakouji
31313e0f3d [Feature] ernie4_5_vl_moe support huggingface safetensor loading (#3750)
* update

* update

* update in tp

* add todo

* update

---------

Co-authored-by: aquagull <hongyuh@qq.com>
2025-09-03 02:58:59 -07:00
lizexu123
4c998c3636 [Code Simplification] delete cum_offsets_out (#3815)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* fix

* fix
2025-09-03 16:15:33 +08:00
YuanRisheng
0a1ce612c2 V1 loader support ep (#3801) 2025-09-03 16:05:41 +08:00
Yuan Xiaolan
fa58a9fa8f qk norm for speculate decode C16 (#3637) 2025-09-03 14:53:56 +08:00
plusNew001
d22d3de256 [XPU] Update XPU CI case (#3837)
* Add debug environment variable exports

Added debug environment variable exports for CLANG_PATH and XVLLM_PATH.

* Lock paddlepaddle-xpu version in CI script

Temporarily lock paddlepaddle-xpu version due to framework update issues.

* Update no_proxy environment variable in CI workflow

* Install lsof tool in run_ci_xpu.sh
2025-09-03 14:32:12 +08:00
lzy
2527eb0e4e fix test_append_attention_with_output.py (#3831)
Co-authored-by: plusNew001 <95567040+plusNew001@users.noreply.github.com>
2025-09-03 14:07:50 +08:00
AIbin
54b458fd98 [Doc] update wint2 doc (#3819)
* update_wint2_doc
2025-09-03 11:27:43 +08:00
plusNew001
d81c57146f [XPU] FIX XPU CI BUG (#3829)
* Add debug environment variable exports

Added debug environment variable exports for CLANG_PATH and XVLLM_PATH.

* Lock paddlepaddle-xpu version in CI script

Temporarily lock paddlepaddle-xpu version due to framework update issues.
2025-09-03 11:25:48 +08:00
ooo oo
2396e49f9e 【Hackathon 9th No.73】add unit tests for graph_opt_backend (#3609)
* test: add unit tests for graph_opt_backend

* refactor(tests): improve graph optimization test structure and readability

* fix(tests): correct CUDA graph related typos in test files

- Fix class name: TestCUDAGrpahSubgraph -> TestCUDAGraphSubgraph

* refactor(test): support attention layer and optimize graph optimization backend test to eliminate redundant baseline calculations

* remove some func call

---------

Co-authored-by: RAM <gstian5555@outlook.com>
Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-09-03 11:18:00 +08:00
co63oc
94a61d505c fix dcu_worker.py (#3734) 2025-09-03 10:57:42 +08:00
co63oc
ce998449e0 fix w8a8.py (#3733) 2025-09-03 10:57:26 +08:00
Echo-Nie
f7a4bea785 【Hackathon 9th No.84】Supplementary Unit Test for fastdeploy/reasoning (#3570)
测试内容:测试基类的注册、获取函数功能是否正常

Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-09-03 10:55:02 +08:00
co63oc
5441538173 rename fused_get_rope.cu (#3752)
* rename fused_get_rope.cu

* fix

* fix typos

* fix

* fix
2025-09-03 10:54:34 +08:00
ltd0924
2c9b169c0e [BugFix] fix scheduler invalid (#3803)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* [BugFix] fix max streaming tokens invalid

* fix scheduler bug

* fix scheduler bug
2025-09-02 20:28:51 +08:00
Longzhi Wang
e0c9a6c76c [Feat] Support streaming transfer data using ZMQ (#3521)
* Support streaming transfer data of ZMQ

* fix typo

* fix typo

* support tp

* add unittest

* update

* update

* fix typo

* fix typo

* fix tp_num in ci machine

---------

Co-authored-by: Wanglongzhi2001 <>
2025-09-02 19:52:19 +08:00
Echo-Nie
0fe1d62232 [MTP] add test_draft_model_set_value_by_flags.py (#3741) 2025-09-02 19:33:33 +08:00
Jiang-Jia-Jun
18e5d355a1 Update version in docs 2025-09-02 19:21:10 +08:00
yangjianfengo1
8e1b35a09b 【Fix bug] w4afp8 的nblock固定为256,并且fa3的append attn 增加mask参数 (#3771)
* fix w4afp8

* 增加集中式配置

* codestyle

* fix fa3 append attn
2025-09-02 19:17:01 +08:00
bukejiyu
b6a4115369 [v1loader]Reduce EB300B model loading time (#3700)
* speed up eb45

* update
2025-09-02 19:13:57 +08:00
YUNSHEN XIE
693c7d781c fix ce compile job (#3768)
* fix ce compile job

* update

* update

* update

* update
2025-09-02 18:37:13 +08:00
co63oc
aa067a3106 rename speculate_token_penalty_multi_scores.cu (#3735) 2025-09-02 18:12:11 +08:00
lzy
7a521bbf62 Modify mask_offset‘s format (#3525)
* modify mask_offset in decode

* modify mask_offset unittest

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-09-02 03:05:35 -07:00
co63oc
f296aff6cf rename speculate_stop_generation_multi_stop_seqs (#3743) 2025-09-02 18:04:29 +08:00
RAM
205b706ef8 [Executor] Fix bug of import paddle with RLHF (#3781) 2025-09-02 17:32:13 +08:00
Yuanle Liu
306c024ff3 [BugFix] fix error of import paddle.base.core.Config (#3761)
* 延迟 import Config

* support chunked_prefill

* support chunked_prefill
2025-09-02 17:23:27 +08:00
ltd0924
905d89e42f [Feature] support model weight update in ep (#3765)
* support model weight update in ep

* support model weight update in ep

* support model weight update in ep

* support model weight update in ep

* Update fused_moe_backend_base.py

* Update worker_process.py

* Update worker_process.py

* Update dynamic_weight_manager.py
2025-09-02 17:16:03 +08:00
kevin
1908465542 [Feature] mm and thinking model support structred output (#2749)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* mm support structured output

* update code

* update code

* update format

* update code

* update code

* add enable_thinking default

* update code

* add structured_outputs test case

* add ci install xgrammar

* add ci timeout time

* update test for structured_outputs

* update code

* add error traceback info

* update error msg

* update structred output code

* update code

* update code

* update config

* update torch version

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-02 16:21:09 +08:00
Jiang-Jia-Jun
0e4df5a6f4 [Feature] Setting number of apiserver workers automatically (#3790)
Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-09-02 14:17:48 +08:00
ltd0924
bf0cf5167a [BugFix] fix max streaming tokens invalid (#3789) 2025-09-02 13:57:32 +08:00
kevin
7e751c93ae [BugFix] Fix chunked prefill (#3759)
* add error traceback info

* update error msg

* update code

* default enable chunked prefill

* update code

* update code

* add envs

* update code

* update enable chunked_prefill

* update code

* update code

* update code

* update code

* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-09-02 13:40:45 +08:00
Jiang-Jia-Jun
27f2e7a6f1 Create faq.md 2025-09-02 11:07:37 +08:00
co63oc
6ac7cea81b fix test_load_mtp (#3780) 2025-09-02 10:21:02 +08:00
Zhang Yulong
adc246127b Update test_ernie_21b_mtp.py (#3783)
Some checks failed
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
暂时跳过多卡MTP case
2025-09-01 20:39:40 +08:00
lizexu123
6dd61a1bab fix Document (#3782)
Co-authored-by: example_name <example_email>
2025-09-01 20:22:43 +08:00
YUNSHEN XIE
253f388372 add ci images build job (#3749)
update

update
2025-09-01 19:57:36 +08:00
co63oc
d6369b4d51 fix typos (#3684) 2025-09-01 17:50:17 +08:00
Jiang-Jia-Jun
0513a78ecc Update docs for reasoing-parser 2025-09-01 17:42:58 +08:00
Jiang-Jia-Jun
0297127a93 Update FASTDEPLOY_VERSION to 2.3.0-dev 2025-09-01 16:48:42 +08:00
Jiang-Jia-Jun
2bd7d90929 Remove useless parameters
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-01 14:43:56 +08:00
YuanRisheng
6566e29807 Add loader test for mtp (#3724)
* add test for mtp

* fix unittest

* fix
2025-09-01 10:55:49 +08:00
Zhang Yulong
085fe070f2 add CI cases (#3714) 2025-09-01 10:06:49 +08:00
ming1753
927e8ec55e Add more runtime information to resource manager (#3706)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-09-01 00:25:28 +08:00
chenjian
465065cd19 [Bug fix] Fix prefix cache in V1 (#3715)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* [Bug fix] Fix prefix cache in V1

* fix code style
2025-08-31 21:29:33 +08:00
lizhenyun01
bed09ae8f8 fix mask_offset in append_attn (#3745)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* fix mask_offset in append_attn

* fix test
2025-08-31 15:03:16 +08:00
kevin
753772ace8 default enable chunked prefill (#3731)
* add error traceback info

* update error msg

* update code

* default enable chunked prefill

* update code

* update code

* add envs

* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-31 13:15:13 +08:00
李泳桦
98e03fb4ea [feat] add metrics for yiyan adapter (#3219) (#3614)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* [feat] add metrics for yiyan adapter

* [fix] fix metrics num_requests_waiting and num_requests_running

* [fix] fix metrics gpu_cache_usage_perc

* [refactor] change where requests_number increases

* [chore] rename xxx_block_num as xxx_gpu_block_num, and update their values accordingly

* [chore] delete useless code
2025-08-30 23:20:58 +08:00
Sunny-bot1
fe5d09f9ee [FIX]Fix Machete compile via ENABLE_MACHETE (#3727)
* add ENABLE_MACHETE

* fix

* revert

* update

* pre_commit

* fix

* fix

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: aquagull <hongyuh@qq.com>
2025-08-30 17:50:17 +08:00
SunLei
b9af95cf1c [Feature] Add AsyncTokenizerClient&ChatResponseProcessor with remote encode&decode support. (#3674)
* [Feature] add AsyncTokenizerClient

* add decode_image

* Add response_processors with remote decode support.

* [Feature] add tokenizer_base_url startup argument

* Revert comment removal and restore original content.

* [Feature] Non-streaming requests now support remote image decoding.

* Fix parameter type issue in decode_image call.

* Keep completion_token_ids when return_token_ids = False.

* add copyright
2025-08-30 17:06:26 +08:00
luukunn
9a7c231f2c [Feature]support chat_template.jinja (#3721)
* add support chat_template.jinja

* add support chat_template.jinja
2025-08-30 17:05:34 +08:00
lizexu123
b21e085f3e [Code Simplification] delete print (#3729) 2025-08-30 16:19:07 +08:00
chen
7568b20098 check (#3720) 2025-08-30 16:04:20 +08:00
lizexu123
455205f991 [Features] support hugging face qwen3 moe (#3649)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* split ut

* qwen3-30B-A3B

* fix

* add test

* add test_torch_model.py

* fix test_torch_model.py

* delete print

* fix moe

* delete init.py

* fix

* fix

---------

Co-authored-by: bukejiyu <395822456@qq.com>
Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
2025-08-30 15:26:05 +08:00
Zero Rains
f206474cc7 fix the bug when num_key_value_heads < tensor_parallel_size (#3717) 2025-08-30 12:40:00 +08:00
chenjian
c4b1f6b0a5 [Optimize] Increase zmq buffer size to prevent apiserver too slowly to consume (#3723) 2025-08-30 10:45:26 +08:00
YUNSHEN XIE
a18afcfdd9 Optimize coverage jobs (#3683)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-30 00:12:40 +08:00
chen
cd252ec673 [Feature]support load eb 0.3B and 21B torch model (#3660)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
2025-08-29 20:00:48 +08:00
yangjianfengo1
3754a9906d [Feature] block sparse attention (#3668)
* 支持稀疏attn

* fix bug

* code style

* fix moba attn get kv shape

* 修复a100编译

* codestyle

* code style

* code style

* code style

* fix conflict

* 增加单侧

* code style

* 增加eblite 加载时间

* fix bug

* for ci

* for ci

* for ci

* for ci

* 支持mlp block size 128

* 增加小算子单测

* fix 单测 mlp

* 将环境变量加入到config里面

* fix rollout config

* 修复显存

* add test server

* add test server

* fix mlp  最后一层使用full attn
2025-08-29 19:46:30 +08:00
zhouchong
ccd52b5596 [Model]support qwen2_5_vl (#3557)
* adapt qwen_2_5_vl model

* adapt qwen_2_5_vl VIT model

* adapt qwen2_5_vl images_embeds

* adapt qwen2_5_vl 3D rope

* adapt qwen2_5_vl 3D rope v2

* adapt qwen2_5_vl processor

* adapt qwen2_5_vl bypass resampler_model

* adapt qwen2_5_vl 绕过部分ernie逻辑

* adapt qwen2_5_vl 绕过部分ernie逻辑 v2

* adapt qwen2_5_vl 权重加载与命名修改

* adapt qwen2_5_vl 非必须think_end_id

* adapt qwen2_5_vl 区分多种模型的extract_vision_features

* fix:adapt qwen2_5_vl model

* adapt qwen2_5_vl norm

* adapt qwen2_5_vl  processor 更新

* adapt qwen2_5_vl image and video success

* adapt qwen2_5_vl 部分整理代码

* adapt qwen2_5_vl 支持多卡

* adapt qwen2_5_vl on latest develop

* adapt qwen2_5_vl RL

* adapt qwen2_5_vl 整理代码

* support noex rope3d

* adapt qwen2_5_vl add init.py

* adapt qwen2_5_vl add init.py v2

* adapt qwen2_5_vl remove space

* adapt qwen2_5_vl remove space v2

* adapt qwen2_5_vl pre-commit

* adapt qwen2_5_vl update

* adapt qwen2_5_vl pre-commit v2

* adapt qwen2_5_vl modify comments

* adapt qwen2_5_vl fix indentation

* adapt qwen2_5_vl fix indentation v2

---------

Co-authored-by: wangyafeng <wangyafeng@baidu.com>
Co-authored-by: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com>
Co-authored-by: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com>
2025-08-29 18:28:39 +08:00
YuBaoku
65425bf858 [CI] update paddle version to nightly (#3698) 2025-08-29 18:16:13 +08:00
Yuan Xiaolan
c71ee0831c add w4afp8 offline script (#3636) 2025-08-29 17:56:05 +08:00
zyfncg
f677c032c0 [CudaGraph] [SOT] Support spliting static graph into piecewise graph with cuda_graph (#3478)
* support spliting static graph into piecewise graph with cuda_graph

* Update fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix merge conflict

* fix bug

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-08-29 16:28:01 +08:00
lzy
48d760539b fix deepcopy(tp_group) in spec (#3648) 2025-08-29 16:08:21 +08:00
Ryan
45f81b34f0 add dtype int32 (#3692)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-29 14:56:35 +08:00
xiaoxiaohehe001
1bf4fc7f36 support w4afp8 eplb (#3680) 2025-08-29 14:43:06 +08:00
Yuanle Liu
68f87240da fix key error in mm (#3702) 2025-08-29 14:35:12 +08:00
李泳桦
88297240e7 [feat] completion api supports passing input token ids in either prompt or prompt_token_ids (#3311)
* [feat] completion api supports passing input token ids in either `prompt` or `prompt_token_ids`

* [fix] update comment

* [fix] fix type error

* [test] add a unittest file for serving api test

* [test] try to fix ci error

* [chore] rename test function names

* [test] try to fix ci error

* [test] try to fix ci error

* [test] add tests for qwen
2025-08-29 14:19:42 +08:00
周周周
17b414c2df MoE Default use triton's blockwise fp8 in TP Case (#3678) 2025-08-29 11:07:30 +08:00
co63oc
b6edd15d55 fix scaled_gemm_f8_i4_f16_weight_quantize input (#3685) 2025-08-29 11:04:04 +08:00
Yuanle Liu
2fb2c0f46a fix MultimodalRegistry (#3699) 2025-08-29 11:01:30 +08:00
Echo-Nie
43d5bd62b4 【Hackathon 9th No.70】supplementary unit test for CPUPlatform and CUDAPlatform (#3580)
* 功能模块 CUDAPlatform、CPUPlatform 单测补充

* update the "is_cuda" to "is_cuda_and_available"

* fix pre-commit

---------

Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-08-29 10:34:05 +08:00
lifulll
72094d4d82 enable dcu ci (#3402) 2025-08-29 10:23:08 +08:00
kevin
73d60fe64d update ci envs for structred output (#3687)
* add error traceback info

* update error msg

* update code

* update ci envs for structred output

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-29 10:21:36 +08:00
bukejiyu
0b51b9c35b fix qwen3 235B tp 8 (#3697)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-28 23:46:25 +08:00
Yuanle Liu
4957908275 add input_processor plugin (#3657)
* add input_processor plugin

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update
2025-08-28 22:53:57 +08:00
ming1753
02b3644903 [Bug Fix] VL Support w4a8/w4afp8 (#3686)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
2025-08-28 21:38:35 +08:00
YuanRisheng
808b548761 support tmp (#3675) 2025-08-28 19:42:32 +08:00
Divano
368bbd9dc6 Update _base_test.yml (#3690)
新增测试并发参数ci case
2025-08-28 19:15:19 +08:00
gaoziyuan
fc635acc47 [BugFix]fix dp&ep&tp and muti node infer (#3629)
* rm log

* fix bug

* fix bug

* fix dp&ep&tp and muti node infer

* fix

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2025-08-28 19:09:10 +08:00
Divano
17731a8acd add concurrency cases (#3689) 2025-08-28 18:30:19 +08:00
Liumengyuan
2a73a6df03 fix_fp8_deepgemm_moe_tp_bug (#3658) 2025-08-28 17:19:02 +08:00
Liumengyuan
e93d4cfcdd Add with_output version AppendAttention (#3302)
* get use_output from fd_config

* add clear TODO description

* add mask_offset para to align with develop

* fix bug

* fix use_output logic

* fix sot bug
2025-08-28 17:10:18 +08:00
ltd0924
94ded434bd [BugFix] ep mixed offline exit (#3661)
* Update expert_service.py

* Update expert_service.py
2025-08-28 17:09:07 +08:00
ltd0924
e5015eea05 [BugFix] fix logger (#3666) 2025-08-28 17:08:00 +08:00
bukejiyu
73cf6096da fix (#3676)
* fix

* update
2025-08-28 17:06:32 +08:00
ltd0924
98c217b428 Update config.py (#3669) 2025-08-28 15:30:51 +08:00
co63oc
d4fc893fe3 fix typos (#3633)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-28 14:42:24 +08:00
co63oc
c294fc8139 Fix target_version (#3159)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* Fix

* fix

* fix
2025-08-28 14:17:54 +08:00
Mattheliu
108d989d9d [Docs] add fastdeploy_unit_test_guide.md (#3484)
* docs:add fastdeploy_unit_test_guide.md

* docs:fix fastdeploy_unit_test_guide.md

* docs: add FastDeploy unit test spec (EN) and update usage nav

* fix codestyle
2025-08-28 14:12:25 +08:00
plusNew001
b791bea0c5 Update run_ci_xpu.sh to lock xvllm version (#3671)
Lock version due to xvllm update causing service errors.
2025-08-28 12:30:50 +08:00
Yuan Xiaolan
d37331fc71 fix w4afp8_gemm_scale_permute import error on A100 (#3611) 2025-08-28 11:42:23 +08:00
YuanRisheng
ad9b95e6dd fix rl bugs (#3654) 2025-08-28 11:09:34 +08:00
yangjianfengo1
e81046fdad 【New Feature】集中式支持w4afp8 (#3644)
* 支持tp w4afp8

* code style
2025-08-28 10:53:24 +08:00
周周周
76513f6416 Support 45t fp8 8 GPU (#3659) 2025-08-28 10:52:53 +08:00
Echo-Nie
7afcd4b776 【Hackathon 9th No.77】supplementary unit test for get_filtered_metrics (#3578)
* 功能模块 fastdeploy/metrics/metrics/get_filtered_metrics 单测补充

* fix pre-commit

---------

Co-authored-by: Tao Luo <luotao02@baidu.com>
2025-08-28 10:39:02 +08:00
ltd0924
3d92fb09f7 [BugFix] fix parameter is 0 (#3592)
* Update engine_client.py

* fix

* Update common_engine.py
2025-08-28 09:52:36 +08:00
Sunny-bot1
479c8b85d3 [Optimize]support machete weight only gemm (#3561)
* support machete weight only gemm

* add generate

* update

* fix

* change file location

* add sm_version limit

* fix

* fix

* fix ci

* fix coverage

* fix xpu
2025-08-28 09:49:58 +08:00
Zero Rains
e37e86b3b8 [V1 Loader]support param create and load for wint2 and xpu backend (#3581)
* support wint2 backend'

* [V1 Loader]support param create and load for wint2 and xpu backend

* update weight shape name

* update

* update

* update baseline.txt

* update model name

* update baseline.txt

* fix codestyle

* remove debug coode
2025-08-28 09:49:36 +08:00
lizexu123
b28a0343a6 fix ENABLE_V1_KVCACHE_SCHEDULER (#3625)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-27 21:21:29 +08:00
ltd0924
2974016103 [BugFix] fix ce bugs (#3641)
* [BugFix] fix tp8 client refuse

* fix engine port bug

* Update utils.py
2025-08-27 20:38:15 +08:00
Yuanle Liu
836345a4dd delete ernie4_5_vl_tokenizer (#3631) 2025-08-27 20:36:02 +08:00
Liumengyuan
11803e0907 fix undefined cuPointerGetAttribute symbol error (#3628) 2025-08-27 20:24:59 +08:00
Jiang-Jia-Jun
c694fa2879 Revert "[Feature] block sparse attention (#3209)" (#3647)
This reverts commit 646a0c2fd8.
2025-08-27 17:35:04 +08:00
李泳桦
b2afdf4fc6 [fix] qwen output inconsistency when top_p=0 (#3634)
* [fix] qwen output inconsistency when top_p=0

* [fix] remove decode pre_id code
2025-08-27 17:16:23 +08:00
lzy
1265f6c192 deepgemm don't support tp+ep (for ci) (#3638)
* deepgemm don't support tp+ep (for ci)

* deepgemm don't support tp+ep (for ci)
2025-08-27 16:39:19 +08:00
plusNew001
f0140be1e1 Change paddlepaddle-xpu installation command (#3646)
Updated the installation command for paddlepaddle-xpu to use a specific wheel file.
2025-08-27 16:17:19 +08:00
JYChen
e645db348b [docs] Update best practice doc (#3539)
* fix some docs error

* [docs] x1 best-practice

* update docs

* fix docs
2025-08-27 15:45:30 +08:00
xjkmfa
afb9f327ef 【CI case】for echo finish_reason text_after_process and raw_prediction check (#3630)
* Add ci case for min token and max token

* 【CI case】include total_tokens in the last packet of completion interface stream output

* echo&finish_reason&text_after_process&raw_prediction check

* echo&finish_reason&text_after_process&raw_prediction check

* echo&finish_reason&text_after_process&raw_prediction check

* echo&finish_reason&text_after_process&raw_prediction check

* echo&finish_reason&text_after_process&raw_prediction check

---------

Co-authored-by: xujing43 <xujing43@baidu.com>
2025-08-27 15:21:16 +08:00
chen
5ad8721506 check (#3639) 2025-08-27 14:32:13 +08:00
plusNew001
f8b70bf60c update xpu ci (#3632)
* Update Docker image version in CI workflow

* Modify paddlepaddle-xpu installation and add dependencies

Updated installation source for paddlepaddle-xpu and added dependency download step.

* Fix no_proxy environment variable in CI workflow
2025-08-27 14:25:56 +08:00
chen
ce9c0917c5 [Precision] Support lm_head layer running in float32 (#3597)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* support lm_head fp32 bf16 fp16

* support lm_head fp32 bf16 fp16

* add doc and check code

* lm_head_fp32 specify lm_head as fp32

* code check

* check doc
2025-08-27 11:34:53 +08:00
xiaoxiaohehe001
ad319a87cc support fa3 rope3d (#3622) 2025-08-27 11:31:29 +08:00
YUNSHEN XIE
85afa72763 fix publish task (#3635)
* fix publish task

* disable ut
2025-08-27 11:14:53 +08:00
yangjianfengo1
646a0c2fd8 [Feature] block sparse attention (#3209)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* 支持稀疏attn

* fix bug

* code style

* fix moba attn get kv shape

* 修复a100编译

* codestyle

* code style

* code style

* code style

* fix conflict

* 增加单侧

* code style

* 增加eblite 加载时间

* fix bug

* for ci

* for ci

* for ci

* for ci

* 支持mlp block size 128

* 增加小算子单测

* fix 单测 mlp

* 将环境变量加入到config里面

* fix rollout config
2025-08-26 07:16:04 -07:00
RAM
f0a362af18 [CUDAGraph]Switch the scope so that output buffer of CUDAGraph can automatically release (#3612)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* fix typo

* fix typo

* add print dot files

* fix bug

* Switch the scope so that output buffer of cudagraph can automatically release

* Revert "add print dot files"

This reverts commit dc21809eb5.
2025-08-26 21:28:19 +08:00
gaoziyuan
82e64b13e1 [NewFeature]Support dp multi api server && Fix some bug in mixed ep && merge develop (#3598)
* [Feature] update ep

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix queue ports idx

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* fix ci

* Update engine.py

* fix ci

* fix some bug in mixed ep

* add server fix and op fix

* rm some log

* fix code style

* ltd fix

* fix

* fix

* fix some bug

* fix bug

* fix bug

* fix style

* Update config.py

* Update splitwise_connector.py

* Update cache_messager.py

* Update __init__.py

* merge and fix

* Update engine.py

* Update common_engine.py

* Update run_ci_xpu.sh

* Update ernie_processor.py

* Update ernie_processor.py

---------

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
2025-08-26 19:59:02 +08:00
Yuanle Liu
cbce94a00e rename ernie_xxx to ernie4_5_xxx (#3621)
* rename ernie_xxx to ernie4_5_xxx

* ci fix
2025-08-26 19:29:27 +08:00
YuanRisheng
642480f5f6 [CI] Standard unittest (#3606)
* standard unittest

* fix bugs

* fix script
2025-08-26 19:03:11 +08:00
SunLei
2f28f40d90 fix: replace list * n initialization with list comprehension to avoid shared references (#3618) 2025-08-26 17:53:31 +08:00
bukejiyu
3200a80de3 [v1 loader]support fp8 (#3593)
* support fp8

* update ci
2025-08-26 02:42:46 -07:00
RAM
00898603c8 [CUDAGraph]Add debug func (#3616)
* add print dot files

* refine code
2025-08-26 16:43:48 +08:00
xiaoxiaohehe001
9afa236e39 [NewFeatures] support eplb (#3547)
* [NewFeatures] support eplb

* fix eplb
2025-08-26 16:19:30 +08:00
Yuanle Liu
56e2d7e668 adaptive rms_norm's dtype (#3617)
* adaptive rms_norm's dtype

* adaptive rms_norm's dtype

* add approve coverage

---------

Co-authored-by: liuyuanle <liuyuanle@baidu.com>
2025-08-26 15:29:15 +08:00
lzy
d339df2e90 Supports DP+TP+EP hybrid parallel deployment strategy (#3489)
* Support DP+TP+EP hybrid parallel deployment strategy

* Support DP+TP+EP hybrid parallel deployment strategy

* fix conflict

* add moe_tp_ep function split_allgather_out

* del tp_group in moe_cutlass_backend

* for ci

* fix parallel_config for ci

* del log
2025-08-26 00:04:01 -07:00
freeliuzc
52eda7fdb3 [Feature][MTP]support new speculative decoding method named hybrid mtp with ngram (#3610) 2025-08-26 14:29:22 +08:00
AIbin
0a0d2959b9 qkv_a_proj horizontal fusion (#3591)
Support DSK qkv_a_proj horizontal fusion under V0 Loder
2025-08-26 14:25:57 +08:00
YuBaoku
75db0d1ae2 [CI] reopen sot test (#3613)
* [CI] change check_service time to 360s

* [CI] disable sot test temporarily

* [CI] reopen sot test
2025-08-26 14:23:38 +08:00
xiaoxiaohehe001
70c75798a7 [NewFeatures] support noex rope3d (#3542)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [NewFeatures] support noex rope3d

* [NewFeatures] support noex rope3d encoder
2025-08-26 11:44:57 +08:00
tianlef
0bc7d076fc [CE]add x1 w4a8c8 benchamrk config (#3607)
* [CE]add x1 w4a8c8 benchamrk config

* [CE]add x1 w4a8c8 benchamrk config

* [CE]add x1 w4a8c8 benchamrk config
2025-08-26 11:27:32 +08:00
Ryan
a5b4866ff1 [CudaGraph][SOT] Add unit tests for splitting the static graph into piecewise graphs that support cuda_graph (#3590)
* add unitest

* change sot_warmup_sizes

* wtf; add missed commit
2025-08-26 11:25:04 +08:00
Sunny-bot1
c68c3c4b8b [Feature] bad words support v1 scheduler and specifiy token ids (#3608)
* support bad_words_token_ids

* docs

* fix test

* fix

* bad words support kvcache v1 and token ids

* fix
2025-08-25 20:14:51 -07:00
lizexu123
c43a4bec00 [Features] support hugging face qwen3 dense and qwen2 model (#3574)
* support qwen2 and qwen3 hugging face

* fix moe

* defualt_v1 loader

* hugging_face_format deprecated

* modify hugging_face_foramt to model_format

* model_format auto

* fix environemt

* fix bug

* fix qwen3-0.6 bug

* model_format is str

* fix
2025-08-26 10:54:53 +08:00
ltd0924
66c5addce4 [Bugfix] fix api server control signal bugs (#3531)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* Update serving_chat.py

* Update serving_completion.py

* Update serving_completion.py
2025-08-25 21:13:04 +08:00
RAM
2fa173e327 [Executor] CUDAGraph support RL training (#3265)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* add clear graph opt backend

* cuda graph support rl

* add branch

* 1.fix dynamic_weight_manager bug 2.add clear api for CasualLM

* open test case

* fix typo

* update mkdocs.yaml

* [Docs]Update mkdocs.yml

* update test case

* use unittest in graph test case
2025-08-25 20:59:30 +08:00
Kane2011
2ae7ab28d2 [MetaxGPU] adapt to the latest fastdeploy on metax gpu (#3492) 2025-08-25 17:44:20 +08:00
YuBaoku
c13c904971 [CI] temporarily disable sot test due to occasional timeout issue (#3586)
* [CI] change check_service time to 360s

* [CI] disable sot test temporarily
2025-08-25 14:34:27 +08:00
chen
9cab3f47ff [Feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post processing (#3552)
* [feature] Add temp_scaled_logprobs and top_p_normalized_logprobs parameters for logits and logprobs post processing

* infer engine support temp_scaled_logprobs and top_p_normalized_logprobs

* delete some code

* code check

* code check and add doc

* fix tokenizer.decoder(-1), return 'Invalid Token'

* add ci for temp_scaled and top_p logprobs

* check test

* check seq len time shape

* logprob clip inf

---------

Co-authored-by: sunlei1024 <sunlei5788@gmail.com>
2025-08-25 14:11:49 +08:00
YUNSHEN XIE
2410adb041 Add coverage skip (#3553)
* add coverage skip

* update

* fix
2025-08-25 14:08:24 +08:00
Yuan Xiaolan
9205c88da1 support w4afp8 EP inference (#3044)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-25 11:27:45 +08:00
YUNSHEN XIE
46664985fc Modify the existing coverage collection method (#3573)
fix cov report
2025-08-25 10:35:35 +08:00
YuBaoku
7821534ff5 [CI] add sot test (#3579)
* [CI] add sot test

* [CI] add sot test
2025-08-25 10:14:50 +08:00
lengxia
137e539456 [Feature][XPU] add custom kernels for mtp (#3537) 2025-08-25 10:14:17 +08:00
bukejiyu
bdbac0aa3d support qwen2 weight only (#3571)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
2025-08-24 11:14:34 +08:00
bukejiyu
77514e3e1e [V1 Loader] support weight_only (#3413)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* support wint4/wint8

* delete smoe case

* update ci

* print log
2025-08-23 13:13:41 +08:00
Jiang-Jia-Jun
93e1b63200 Revert "[UnitTest][Copilot] Improve unit test coverage for entrypoints module…" (#3564)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
This reverts commit 36325e9ea7.
2025-08-23 10:44:23 +08:00
YuanRisheng
e481b7a779 fix sot (#3556) 2025-08-23 08:37:06 +08:00
Zero Rains
79f0dbbb55 [V1 Loader] Support qwen2(bf16) (#3502)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* support qwen2(bf16)

* merge bias_loader and weight_loader
2025-08-23 01:08:23 +08:00
YUNSHEN XIE
cb166053ba fix test name (#3493)
* fix test name

* update

* update

* fix

* fix

* update

* update

* update

* update

* update

* fix

* update
2025-08-22 23:43:47 +08:00
Copilot
36325e9ea7 [UnitTest][Copilot] Improve unit test coverage for entrypoints modules (#3546)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* Initial plan

* Add comprehensive unit tests for entrypoints utilities

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

* Complete entrypoints test coverage improvement with tool parser tests

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

* Apply pre-commit formatting to test files - fix trailing whitespace and long lines

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-22 19:20:51 +08:00
zhink
df7c31012b Modified to support custom all reduce by default (#3538) 2025-08-22 16:59:05 +08:00
lddfym
27666ee586 [Feature] Add Qwen25-VL Processor (#3501)
* add qwen-2.5-vl processor

* add qwen25-vl processor

* add qwen25-vl processor

* add qwen25-vl processor

* add qwen25-vl processor position_ids

* add qwen25-vl processor

* add qwen25-vl processor

* position_ids

* add test for qwen25-vl

* organize comments

* formatted

* qwen_vl_processor

* add qwen_vl_processor unittest

* update model path

* update model path

* update qwen_vl_processor unittest

* add unittest and bug fix

* add unittest and bug fix

* Update fastdeploy/input/qwen_mm_processor/image_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/input/qwen_vl_processor.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-08-22 16:49:42 +08:00
YuanRisheng
5b66462f0e Fix fdconfig bugs (#3528)
* fix config

* fix parallel

* fix ips

* fix rl

* open code
2025-08-22 16:17:15 +08:00
plusNew001
7ae41e9daf [CI] fix xpu ci bug (#3535) 2025-08-22 15:08:39 +08:00
freeliuzc
76759108c9 [Feature][SpeculativeDecoding]Support tree-attention (#3514)
* support tree-attention

* fix merge bug

* fix unit-test api

* fix merge bug
2025-08-22 13:36:41 +08:00
YuBaoku
cc88671507 [CI] add container naming and cleanup logic in workflows (#3526) 2025-08-22 11:42:57 +08:00
YUNSHEN XIE
2630260616 disable stable test (#3529) 2025-08-22 11:38:18 +08:00
YuanRisheng
85fbf5455a [V1 Loader]Ernie VL support loader v1 (#3494)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* ernie vl support new loader

* add unittest

* fix test
2025-08-22 11:16:57 +08:00
Zhang Yulong
3cc182236a update ci (#3519)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-21 20:05:50 +08:00
YuanRisheng
c389a4013c Unify server-side and model-side Config(Part-5) (#3497)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* move config

* fix xpu

* fix

* fix vl

* fix vl

* fix unitest

* fix args

* add unitest

* fix test
2025-08-21 19:00:21 +08:00
yangjianfengo1
e5aa7087db 【bug fix】修复w4a8编译慢 (#3510)
* 修复w4a8编译

* code style

* 修复tma copy
2025-08-21 18:50:14 +08:00
Zhang Yulong
a5692e8b7d Add PD CI case (#3490)
* Create test_ernie_03b_pd.py

* Update test_ernie_03b_pd.py
2025-08-21 18:48:34 +08:00
李泳桦
8bea4b1e25 [fix] fix output tokens count in streaming completion api (#3507) 2025-08-21 18:19:13 +08:00
李泳桦
e4f0b755b4 [fix] setting disable_chat_template while passing prompt_token_ids led to response error (#3228)
* [fix] setting disable_chat_template while passing prompt_token_ids led to response error

* [fix] code syntax

* [test] add test case for this bug

* [test] add test case for empty message list

* [test] fix test case for empty message list
2025-08-21 17:30:51 +08:00
luukunn
371fb3f853 [Feature] add tool parser (#3483)
* add tool parser

* add x1 enable_thinking

* restart ci

* fix vl reasoning parser

* modify call style

* modify call style

* add offline enablethinking

* fix completion

* fix

* fix unit test

* fix unit test

* fix unit test

* fix vl reasoning parser

* fix vl reasoning parser
2025-08-21 17:25:44 +08:00
Yzc216
466cbb5a99 [Feature] Models api (#3073)
* add v1/models interface related

* add model parameters

* default model verification

* unit test

* check model err_msg

* unit test

* type annotation

* model parameter in response

* modify document description

* modify document description

* unit test

* verification

* verification update

* model_name

* pre-commit

* update test case

* update test case

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/entrypoints/openai/serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: LiqinruiG <37392159+LiqinruiG@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-08-21 17:02:56 +08:00
Zhang Yulong
b7eee3aec1 Update CI (#3474)
* update CI cases

* update CI cases

* update CI cases

* update CI cases

* Merge upstream/develop and resolve directory rename conflict

* Merge upstream/develop and resolve directory rename conflict

* Merge upstream/develop and resolve directory rename conflict

* update deploy

* update deploy

* update deploy

* update deploy

* update deploy
2025-08-21 16:49:20 +08:00
qw86972190
c83381d650 revert pr (#3481)
Co-authored-by: iosmers <yinwei_hust@163.com>
2025-08-21 14:19:50 +08:00
ltd0924
51f68ae593 [Feature] add dealer manager to reuse the connection (#3471)
* [BugFix] fix control signal release failed

* [BugFix] fix control signal release failed

* update

* update

* update

* [Feature] add dealer manager to reuse the connection

* fix

* fix

* fix

* fix

* fix

* fix

* Create test_dealer_connection_manager.py

* Delete test/entrypoints/openai directory

* Update test_dealer_connection_manager.py

* Update test_dealer_connection_manager.py
2025-08-21 13:11:13 +08:00
YUNSHEN XIE
985b1265c3 CE 编译任务(合入触发) (#3491)
* add ce compile job

* fix

* update
2025-08-21 11:33:26 +08:00
memoryCoderC
31f639f10b [Feature] add prompt_tokens and completion_tokens (#3504)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-21 10:23:27 +08:00
Zero Rains
30b3f2dc07 [BugFix][V1 Loader] fix the bug in creat weight for block_wise_fp8 (#3486)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-20 05:52:54 -07:00
Ryan
bcdfc1d6b9 Add custom op declaration for all_reduce (#3473)
* add custom op declaration

* roll back try except
2025-08-20 20:29:58 +08:00
Zhang Yulong
33ff0bfe38 Update disaggregated.md (#3495)
修复文档错误
2025-08-20 19:39:18 +08:00
YUNSHEN XIE
e197894977 add e2e cases (#3476)
* add e2e cases

* fix
2025-08-20 18:50:14 +08:00
Zhang Yulong
9ff2dfb162 Create eb45-8k-fp8-tp1-dp8_ep.yaml (#3485)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
混合架构EP并行yaml
2025-08-20 14:33:54 +08:00
YuBaoku
33d369586b [CI] remove useless case (#3482) 2025-08-20 14:20:30 +08:00
xiaolei373
5d131485d8 add error log to file (#3431)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* feat(log):add_request_and_response_log

* feat[log]:add error log to file
2025-08-20 09:52:34 +08:00
YUNSHEN XIE
3a6058e445 Add stable ci (#3460)
* add stable ci

* fix

* update

* fix

* rename tests dir;fix stable ci bug

* add timeout limit

* update
2025-08-20 08:57:17 +08:00
kevin
67298cf4c0 add error traceback info (#3419)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add error traceback info

* update error msg

* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-19 19:32:04 +08:00
yangjianfengo1
b047681c5d 【New Feature】支持Fp8 group Gemm 24稀疏 (#3463)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
* 支持24稀疏

* code style

* 增加stmatrix 宏定义判断

* code style
2025-08-19 02:54:47 -07:00
ltd0924
d587fb257f [CI] add test generation demo (#3270)
* Create test_generation.py

* update

* update

* format

* Update test_generation.py

* Update test_generation.py

* Update test_generation.py

* Update test_generation.py

* Update test_generation.py

* Update test_generation.py

* Update test_generation.py

* Update test_generation.py

* Update setup.py

* Delete test/plugins/test_model_runner_register.py

---------

Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
2025-08-19 17:12:40 +08:00
Zero Rains
fef447e350 [V1 Loader] Support MOE parameters create and load for DeepGemm and marlin backend (#3447)
* support deepgemm backend

* support marlin backend

* remove print

* fix process_prequanted_weights
2025-08-19 14:15:53 +08:00
chen
6735626014 fix request_output sampling_params (#3154) (#3464) 2025-08-19 13:52:50 +08:00
ltd0924
bca8905b40 [BugFix] fix control signal release failed (#3390)
* [BugFix] fix control signal release failed

* [BugFix] fix control signal release failed

* update

* update

* update
2025-08-19 13:51:38 +08:00
Zero Rains
8b12c80f90 [FixBug] compute early stopping with real batch size (#3418)
* [FixBug] compute early stopping with real batch size

* update

* fix test_sampler
2025-08-18 22:09:21 -07:00
luukunn
3a7a20d191 [Feature] Pass through the chat_template_kwargs to the data processing module (#3421)
* fix chat_template_args

* fix args

* add offline

* add offline

* fix

* fix

* fix default enable_thinking value

* fix default enable_thinking value

* modify condition

* Revert "modify condition"

This reverts commit 26430bdeb1.

* fix unit test
2025-08-19 10:50:01 +08:00
lizexu123
a053ab889b [BugFix] fix num_running_requests in cuda_graph (#3457)
* fix cuda_grpah

* add note

---------

Co-authored-by: RAM <gstian5555@outlook.com>
2025-08-19 10:47:22 +08:00
AIbin
beec24fd89 【Inference Optimize】DeepSeek-v3 model inference performance optimization (#3455)
* DSK_OPT_01

* update FA3
2025-08-19 10:42:42 +08:00
zhuzixuan
c95b3395e9 【BugFix】completion接口echo回显支持 (#3245)
* wenxin-tools-511,修复v1/completion无法回显的问题。

* 支持多prompt的回显

* 支持多prompt情况下的流式回显

* 补充了 completion 接口支持 echo 的单元测试

* pre-commit

* 移除了多余的test文件

* 修复了completion接口echo支持的单测方法

* 补充了单元测试文件

* 补充单测

* unittest

* 补充单测

* 修复单测

* 删除不必要的assert.

* 重新提交

* 更新测试方法

* ut

* 验证是否是正确思路单测

* 验证是否是正确思路单测

* 验证是否是正确思路单测3

* 优化单测代码,有针对性地缩小单测范围。

* 优化单测代码2,有针对性地缩小单测范围。

* 优化单测代码3,有针对性地缩小单测范围。

* support 'echo' in chat/completion.

* update

* update

* update

* update

* update

* update

* 补充了关于tokenid的单元测试

* update

* 修正index错误

* 修正index错误
2025-08-19 10:41:51 +08:00
lizexu123
32b39620bc [Code Simplification] remove cum_offsets (#3410)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
2025-08-18 20:21:25 +08:00
YUNSHEN XIE
2cf96ddd68 add publish workflow (#3063)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add publish job

* update

* update
2025-08-18 16:42:36 +08:00
luukunn
9c129813f9 [Feature] add custom chat template (#3251)
* add custom chat_template

* add custom chat_template

* add unittest

* fix

* add docs

* fix comment

* add offline chat

* fix unit test

* fix unit test

* fix

* fix pre commit

* fix unit test

* add unit test

* add unit test

* add unit test

* fix pre_commit

* fix enable_thinking

* fix pre commit

* fix pre commit

* fix unit test

* add requirements
2025-08-18 16:34:08 +08:00
Jundong Liu
70ee910cd5 [Excutor] Change cudagraph hashkey from batch size to num_tokens (#3454) 2025-08-18 16:16:48 +08:00
Jundong Liu
ea4a3b479c [Excutor] Increase buffer size to prevent address corruption; add forward metadata debug tool (#3404)
* 修复buffer申请不够大,增加打印forwardmetadata的工具

* fix mistake

* Make CPU tensor in CPUPlace

* Add test about forward_meta_str and Add unitest_requirement

---------

Co-authored-by: RAM <gstian5555@outlook.com>
2025-08-18 16:14:09 +08:00
chen
5585cf7aa5 fix mtp_rej_topp input (#3450) 2025-08-18 16:12:42 +08:00
Divano
246cd7b3a5 Perf (#3453)
* add repitation early stop cases

* add repitation early stop cases

* add stress tool
2025-08-18 15:37:46 +08:00
gaoziyuan
6fdd83da10 fix some bug (#3434) 2025-08-18 14:39:13 +08:00
freeliuzc
a12d0bc549 [Feature][MTP]update multi-draft-token strategy (#3369)
* update multi-draft-token strategy

* fix format

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-08-18 13:59:56 +08:00
Zhang Yulong
3ee6053e5d Add ci case (#3355)
* add ci cases

* debug

debug H20 baseline

* Update run_pre_ce.sh

* Update test_EB_Lite_serving.py

* Update test_EB_VL_Lite_serving.py

* Update test_EB_Lite_serving_mtp.py

* Update test_Qwen3-MoE_serving.py

* Update test_Qwen2-7B-Instruct_serving.py

* Update run_pre_ce.sh
2025-08-18 11:35:56 +08:00
chen
e88f5552db fix cpu __ini__.py (#3448)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-17 12:38:54 +08:00
RAM
33c0197ebe [Docs] Update mkdocs.yml (#3444)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* Updata docs of graph opt backend

* update best_practices

* update mkdocs.yaml

* [Docs]Update mkdocs.yml
2025-08-15 21:57:40 +08:00
RAM
154308102e [Docs]Updata docs of graph opt backend (#3442)
* Updata docs of graph opt backend

* update best_practices
2025-08-15 21:30:32 +08:00
yongqiangma
5703d7aa0f update installation readme (#3429) 2025-08-15 19:09:41 +08:00
yangjianfengo1
615930bc05 Update README (#3426)
* 修改READMe

* code style

* code style
2025-08-15 18:46:28 +08:00
JYChen
6f11171478 fix some docs error (#3439) 2025-08-15 18:45:27 +08:00
yinwei
354575b6d1 [Docs]Modify the gpu-memory-utilization of the 128K 8-card Wint4 model to 0.95 (#3428)
* XPU Update 2.1 Release Documentation

* code style check

* Modify the gpu-memory-utilization of the 128K 8-card Wint4 model to 0.95
2025-08-15 18:34:37 +08:00
YUNSHEN XIE
cc8ee50f27 add accuracy check ci (#3389)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add accuracy ci

* fix

* fix

* update

* rename ci jobs
2025-08-15 15:17:43 +08:00
GoldPancake
4bd6a9fa7d [Bugs] Fix DeepGEMM pre-compile tools. (#3351)
Fix some miss cache problems.
Add README.md.
2025-08-15 14:37:49 +08:00
ming1753
d4e3a20300 [Docs] Release 2.1 docs and fix some description (#3424) 2025-08-15 14:27:19 +08:00
yinwei
fbb6dcb9e4 [Docs]XPU Update 2.1 Release Documentation (#3423)
* XPU Update 2.1 Release Documentation

* code style check
2025-08-15 14:07:47 +08:00
JYChen
562e01c979 update docs (#3420) 2025-08-15 13:00:08 +08:00
Jiang-Jia-Jun
cca96ab1e4 Update Dockerfile.gpu 2025-08-15 12:29:20 +08:00
Jiang-Jia-Jun
7132fa9ec2 Update dockerfile 2025-08-15 12:28:08 +08:00
Sunny-bot1
6c1f3ff897 topk_gating_softmax support bias (#3405) 2025-08-15 11:57:45 +08:00
ltd0924
5a84324798 [Doc] Add multinode deployment documents (#3417)
* Create multi-node_deployment.md

* Create multi-node_deployment.md

* Update mkdocs.yml
2025-08-15 10:37:04 +08:00
chen
f0f00a6025 [OPs] Universal optimization and Fix early_stop cuda 700 (#3375)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* delete nonzero

* delete setup_ops_base.py

* check if

* check gcp infer_seed.cpu()

* fix repetition_early_stopper_kernel cuda 700
2025-08-14 22:40:44 +08:00
YuanRisheng
09c979f3dd [V1 Loader] Support Ernie text(moe and dense) (#3110)
* new loader support 0.3B

* fix weight

* support parallel load

* support parallel load

* fix slice

* support moe

* delete code

* perfect code

* perfect code
2025-08-14 20:25:28 +08:00
xjkmfa
ab60292f89 【CI】 evil case (#3359)
* Add ci case for min token and max token

* 【CI case】include total_tokens in the last packet of completion interface stream output

* 边缘检测 ,攻击性测试

* 边缘检测 ,攻击性测试

* 边缘检测 ,攻击性测试

* 边缘检测 ,攻击性测试

---------

Co-authored-by: xujing43 <xujing43@baidu.com>
2025-08-14 20:00:47 +08:00
freeliuzc
cacc52bf21 modify readme (#3409) 2025-08-14 19:47:36 +08:00
Sunny-bot1
79d8ae4c38 [UT Fix] Fix bad_words test (#3385)
* fix bad_words test

* add streaming

* fix

* fix
2025-08-14 03:55:02 -07:00
lzy
1e06b9fa6d make append_attn supports mask_offset (#3138)
* make append_attn supports mask_offset

* add unittest
2025-08-14 03:40:55 -07:00
memoryCoderC
6031f9a5f5 [BugFix] fix ErnieProcessor not set raw_prediction (#3400) 2025-08-14 18:07:49 +08:00
YUNSHEN XIE
f72db9386c Add requirements for running unit tests (#3350)
* Add requirements for running unit tests

* update
2025-08-14 17:37:18 +08:00
lizexu123
7b596d0877 [BugFix] fix real_bsz in ep (#3366)
* Your commit message here

* fix ep

* delete cuda_graph
2025-08-14 17:31:19 +08:00
gaoziyuan
0ea8712018 fix op tests (#3398) 2025-08-14 16:45:25 +08:00
Sunny-bot1
2e7831185f [Optimize]Add norm_weights feature for topk_gating_softmax (#3372)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-14 15:05:23 +08:00
Jiang-Jia-Jun
666ab65a51 [Polish Code] Remove useless notes 2025-08-14 14:04:52 +08:00
Jiang-Jia-Jun
dd583fb16a [BugFix] Fix default log level of paddleformers (#3376)
* [BugFix] Fix default log level of paddleformers

* [BugFix] Fix default log level of paddleformers

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-08-14 11:36:24 +08:00
xiaolei373
d4f610e4cd feat(log):add_request_and_response_log (#3373)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-13 23:27:41 +08:00
ming1753
396dba0d62 [Bug Fix] Fix V1 video bug (#3388)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-13 23:04:07 +08:00
YUNSHEN XIE
1ace375fc3 Optimize CI execution workflow (#3371)
* Optimize CI execution workflow

* fix
2025-08-13 18:47:31 +08:00
Zero Rains
be94bdd0b0 [Loader V1] modify layername for DeepSeekV3 (#3336)
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
2025-08-13 15:47:06 +08:00
memoryCoderC
f702a675a1 fix TestOpenAIServingCompletion fail (#3368) 2025-08-13 15:45:07 +08:00
EnflameGCU
d1a92e3e17 [GCU] Enable gcu CI (#3190)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* [GCU] Update to the latest version

* [GCU] Enable CI
2025-08-13 11:48:24 +08:00
yzwu
ce9180241e [Iluvatar GPU] Modify the names of some variables (#3273) 2025-08-13 11:38:02 +08:00
Kane2011
b4fef2cf29 [MetaxGPU] Support FastDeploy on metax gpu (#3241)
* [MetaxGPU] Support FastDeploy on metax gpu

* Update metax_worker.py

1. change worker log;
2. remove custom allreduce, adapt it later;
3. remove cuda graph;

* Update __init__.py

1. remove metax's key work comment

* Update __init__.py

1. remove metax's key word comment;
2. add fused_moe_kernel_paddle import

---------

Co-authored-by: yongqiangma <xing.wo@163.com>
2025-08-13 11:11:54 +08:00
Ryan
ed6bff215a fix custom op order rms_norm_eps (#3348) 2025-08-13 10:12:49 +08:00
Sunny-bot1
8224b21525 Refactor moe_topk_select op to use apply_norm_weight as a template parameter (#3345)
* Refactor moe_topk_select op to use apply_norm_weight as a template parameter

* update test
2025-08-13 08:44:16 +08:00
luukunn
eda83ca672 add Tool Parser (#3272)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add tool-parser

* add tool-parser

* add tool parser

* add tool parser

* fix

* add offline

* add offline

* fix

* parsers:tool&reasoning

* 修改tool parser名称·

* update

* fix reasoning-parser

* add requirements

* fix finish reason

* fix

* fix reasoning-parser

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: zhuzixuan <zhuzixuan@baidu.com>
2025-08-13 01:06:55 +08:00
memoryCoderC
2d1a4cacdf Completion add raw_prediction/text_after_process (#3356) 2025-08-12 23:06:45 +08:00
zhink
2c0d853067 add test for CustomAllreduce (#3313)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-12 20:44:47 +08:00
YUNSHEN XIE
8791ad4e61 Pre ce modified (#3335)
* update

* update

* fix

* fix

* update

* update

* update

* fix

* update
2025-08-12 20:25:03 +08:00
memoryCoderC
c575611a5b [BugFix] v1/completions add finish_reason (#3246)
* [BugFix] v1/completions add finish_reason

* update TestOpenAIServingCompletion for merge

---------

Co-authored-by: YUNSHEN XIE <1084314248@qq.com>
2025-08-12 19:40:26 +08:00
Jiang-Jia-Jun
90bfa0be9c Update envs.py 2025-08-12 16:24:47 +08:00
Jiang-Jia-Jun
5620bd12de Update envs.py 2025-08-12 16:24:33 +08:00
YUNSHEN XIE
7d0d5a543a Use latest PaddlePaddle package (#3347)
* Use latest PaddlePaddle package

* fix
2025-08-12 16:23:41 +08:00
gaoziyuan
ccc7f1beb3 fix mapping (#3320) 2025-08-12 16:15:59 +08:00
RichardWooSJTU
283da92bfa fix ep lm head (#3244)
Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
2025-08-12 15:38:28 +08:00
ming1753
f5164215be [Bug Fix] fix vl V1 schedule bug (#3323)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* [Bug Fix] fix vl V1 schedule bug

* fix format
2025-08-12 11:31:39 +08:00
yangjianfengo1
b808c49585 [Doc] 增加中英文切换 (#3318)
* 增加中英文切换

* 增加中英文切换

* 修改readme
2025-08-12 11:20:45 +08:00
chenjian
b21272d9ff [Bug fix] fix block num setting in scheduler v1 for develop (#3303)
* fix block num setting in scheduler v1

* fix block num setting in scheduler v1

* fix max_block_num and max_num_batched_tokens setting

* fix max_block_num and max_num_batched_tokens setting

* fix max_block_num and max_num_batched_tokens setting

* fix max_block_num and max_num_batched_tokens setting
2025-08-12 10:38:51 +08:00
Jiang-Jia-Jun
183e3863e8 Remove useless code (#3337) 2025-08-12 10:32:31 +08:00
Sunny-bot1
19fda4e912 fix docs (#3332)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-11 21:03:49 +08:00
JYChen
973ddad91e fix unittest (#3328) 2025-08-11 20:58:24 +08:00
Divano
f27e879785 Update _base_test.yml (#3331) 2025-08-11 20:57:20 +08:00
Sunny-bot1
789dc67ff7 [Docs]fix sampling docs (#3113)
* fix sampling docs

* fix sampling docs

* update
2025-08-11 20:42:27 +08:00
Divano
8bf96217b4 Update test_evil_cases.py 2025-08-11 20:27:02 +08:00
YUNSHEN XIE
770b0aa3c5 fix ci pypi index error (#3326) 2025-08-11 20:21:08 +08:00
kevin
9627619235 fix uvicorn multi worker error (#3300)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-11 19:39:41 +08:00
Zero Rains
b23af29d0b Launch expert_service before kv_cache initialization in worker_process (#3045)
* launch expert_service before kv_cache initialization

* add two signal make sure model loading and expert_service lauching finished

* fix the EP bug

* fix ep

* update launching way

* fix ep

* update

* roback ep

* pre-commit all files

---------

Co-authored-by: RAM <gstian5555@outlook.com>
Co-authored-by: Divano <dddivano@outlook.com>
2025-08-11 19:38:46 +08:00
Zhang Yulong
c27a3dc43b Update deploy.py (#3310)
* Update deploy.py

更新部署工具

* Update deploy.py
2025-08-11 19:11:57 +08:00
Jiang-Jia-Jun
c56c99837a Revert "[BugFix] num_seqs (#3291)" (#3316)
This reverts commit e0aeac58e1.
2025-08-11 16:16:51 +08:00
Yuanle Liu
9571c458f0 enhance eos_tokens (#3274)
* enhance eos_tokens

* update

* update
2025-08-11 14:47:52 +08:00
Divano
21caa63794 update base test (#3304)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* update base test

额外启动一次服务测试repetition stop

* Update _base_test.yml
2025-08-11 14:15:45 +08:00
Zero Rains
42af0b4b64 [V1 Loader] Support DeepSeekV3(bf16) (#3294)
* Support new loader for DeepSeekV3(bf16)

* update paddle version

* remove useless attr
2025-08-11 13:39:28 +08:00
lizexu123
e0aeac58e1 [BugFix] num_seqs (#3291)
* fix num_seqs

* merge develop
2025-08-11 13:38:55 +08:00
chenjian
b88537a456 fix bug for scheduler v0 (#3308) 2025-08-11 13:07:04 +08:00
xjkmfa
71018fb62e 【CI case】include total_tokens in the last packet of completion interface stream output (#3279)
* Add ci case for min token and max token

* 【CI case】include total_tokens in the last packet of completion interface stream output

---------

Co-authored-by: xujing43 <xujing43@baidu.com>
2025-08-11 10:59:47 +08:00
Divano
0b77d396ad Acc (#3301)
* add repitation early stop cases

* add repitation early stop cases

* add accuracy cases
2025-08-11 10:22:06 +08:00
Divano
79868be220 Update _base_test.yml (#3299)
add more cases
2025-08-11 10:03:27 +08:00
chen
46c8491201 merge logprob into batch_output (#3266) 2025-08-11 10:03:00 +08:00
Divano
566badb83c Update _base_test.yml (#3298) 2025-08-11 09:40:14 +08:00
Divano
eaae4a580d Split cases (#3297)
* add repitation early stop cases

* add repitation early stop cases

* split repetition_early_stop from the base test
2025-08-11 09:38:35 +08:00
chenjian
c011cb8b16 [Bug Fix] Fix scheduler bug in develop (#3292)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* Fix scheduler bug in develop

* Fix scheduler bug in develop

* Fix scheduler bug in develop
2025-08-10 13:55:38 +08:00
Jundong Liu
1e4968e810 [Excutor] Fixed the issue of CUDA graph execution failure caused by different branches during decoding (#3223)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* 彻底解决解码切块问题

* update C8 and C4 kernel

* fix problem

* fix with pre-commit

* retain branch for mtp
2025-08-09 07:37:19 +08:00
ltd0924
31d4fcb425 [BugFix] fix too many open files problem (#3256)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* Update cache_messager.py

* fix too many open files problem

* fix too many open files problem

* fix too many open files problem

* fix ci bugs

* Update api_server.py

* add parameter

* format

* format

* format

* format

* Update parameters.md

* Update parameters.md

* Update serving_completion.py

* Update serving_chat.py

* Update envs.py

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-08 20:10:11 +08:00
YUNSHEN XIE
22255a65aa add base test ci (#3225) 2025-08-08 19:08:55 +08:00
gaoziyuan
a799d14df1 [Bugfix] Fix model accuracy in some ops (#3231)
* fix noaux_tc op

* fix

* update

* fix qk norm

* fix linear for prequant loader

* test

* fix

* fix

* rm some print

* fix noaux_tc op

* test

* Fix the confused enable_early_stop when only set early_stop_config (#3214)

* fix the confused early_stop_config when only set early_stop_config

* pre-commit

* write a general method

* Add ci case for min token and max token (#3229)

Co-authored-by: xujing43 <xujing43@baidu.com>

* add some evil cases (#3240)

* add repitation early stop cases

* add repitation early stop cases

* add bad cases

* add bad cases

* add evil cases

* qwen3_moe (#3084)

* [Feature] support seed parameter (#3161)

* support seed

* fix

* add SamplingMetadata seed test

* The next_tokens values are inconsistent!

* add air and rejection seed test

* fix

* add SamplingParams seed test

* fix seed=0

* Default to defualt

* fix

* fix args_utils

* fix review

* fix review

* fix

* fix

* add xpu,gcu,iluvatar support seed

* fix

* 【Fix Bug】 修复 fa3 支持集中式bug (#3235)

* fix fa3 集中式bug

* 增加qknorm参数

* fix qk norm

* fix

* update

* fix linear for prequant loader

* fix

* fix

* rm some print

* fix

* fix moe init weight&scale

* fix moe init weight&scale

---------

Co-authored-by: bukejiyu <395822456@qq.com>
Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
Co-authored-by: Zero Rains <linjunlu@zerorains.top>
Co-authored-by: xjkmfa <108254620+xjkmfa@users.noreply.github.com>
Co-authored-by: xujing43 <xujing43@baidu.com>
Co-authored-by: Divano <dddivano@outlook.com>
Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
Co-authored-by: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Co-authored-by: yangjianfengo1 <125249383+yangjianfengo1@users.noreply.github.com>
Co-authored-by: qingqing01 <dangqingqing@baidu.com>
2025-08-08 17:30:37 +08:00
Zero Rains
ce1f353c70 Move create_parameters to __init__ in FuseMOE for CultassBackend and TritonBackend (#3148)
* w4a8 bug

* fix w4a8 bug

* remove code

* modify the triton backend

* fix ep

* fix the bug with tensor_wise_fp8 in triton backend

* fix the RL

* fix bug by merge

* fix the bug in w4a8

* fix the tensor_wise_fp8 bug

* fix RL
2025-08-08 15:55:47 +08:00
plusNew001
d0e9a70380 [CI] add CI logprobs case (#3189)
* [ci] add CI case

* [ci] add CI case

* [ci] add CI case

* [ci] add CI case

---------

Co-authored-by: ZhangYulongg <1272816783@qq.com>
2025-08-08 15:47:55 +08:00
freeliuzc
71267840f7 【Fix】fix mtp bug (#3139) 2025-08-08 13:30:12 +08:00
bukejiyu
b76b17fc1b qwen3 0.3B fix (#3255)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-08 11:35:40 +08:00
Yuanle Liu
fac2f64837 delete parallel_state.py (#3250) 2025-08-08 11:03:29 +08:00
yzwu
fbdd6b0663 [Iluvatar GPU] Optimze attention and moe performance (#3234) 2025-08-08 10:51:24 +08:00
bukejiyu
37569cca86 [feat]add fast_weights_iterator (#3258)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add fast_weights_iterator

* update

* update
2025-08-07 22:36:46 +08:00
chenjian
5f0b30f6d0 support logprob in scheduler v1 (#3249)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-07 20:14:01 +08:00
Yzc216
6037dd5d9c [fix] multi source download (#3259)
* multi-source download

* multi-source download

* huggingface download revision

* requirement

* style

* add revision arg

* test

* pre-commit

* Change default download

* change requirements.txt

* modify English Documentation

* documentation

* modify model download path

* add requirements

* error optimization

* 连接失败兜底

* 连接失败兜底

* 连接失败兜底

* unit test

* unit test

* unit test

* test

* test

* 兜底修改

* Trigger CI
2025-08-07 19:30:39 +08:00
JYChen
9423c577fe [stop_seq] fix out-bound value for stop sequence (#3216)
* fix out-bound value for stop sequence

* catch error if there are out-of-bounds value

* check in offline mode

* add ut tests
2025-08-07 15:40:21 +08:00
Divano
5885285e57 Ce add benchmark test (#3262)
* add repitation early stop cases

* add repitation early stop cases

* add bad cases

* add bad cases

* add evil cases

* add benchmark gsm8k
2025-08-07 15:28:30 +08:00
YuBaoku
55ac449c31 [CI] remove useless case (#3261) 2025-08-07 15:09:40 +08:00
RAM
820798aec5 [Executor]Update graph test case and delete test_attention (#3257)
* 1.update graph test case 2.delete test_attention

* code style

* delete print
2025-08-07 14:05:15 +08:00
YuanRisheng
0074b423a9 fix ci bug (#3239)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-07 11:32:39 +08:00
hong19860320
93a1731891 [Doc] Update deps and fix dead links (#3252) 2025-08-07 11:04:31 +08:00
李泳桦
09cc4e2802 [fix] fix completion stream api output_tokens not in usage (#3247) 2025-08-07 10:36:00 +08:00
Yzc216
d9e3f88f9e [Feature] multi source download (#3125)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* multi-source download

* multi-source download

* huggingface download revision

* requirement

* style

* add revision arg

* test

* pre-commit

* Change default download

* change requirements.txt

* modify English Documentation

* documentation

* modify model download path

* add requirements

* error optimization

* 连接失败兜底

* 连接失败兜底

* 连接失败兜底

* unit test

* unit test

* unit test

* test

* test
2025-08-07 00:40:27 +08:00
bukejiyu
9408e667a5 [bugfix]fix blockwisefp8 and all_reduce (#3243)
* fix

* update

* fix linear for prequant loader
2025-08-06 23:54:33 +08:00
yangjianfengo1
3a15e0c53e 【Fix Bug】 修复 fa3 支持集中式bug (#3235)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* fix fa3 集中式bug

* 增加qknorm参数
2025-08-06 16:24:27 +08:00
lizexu123
afff4d37ea [Feature] support seed parameter (#3161)
* support seed

* fix

* add SamplingMetadata seed test

* The next_tokens values are inconsistent!

* add air and rejection seed test

* fix

* add SamplingParams seed test

* fix seed=0

* Default to defualt

* fix

* fix args_utils

* fix review

* fix review

* fix

* fix

* add xpu,gcu,iluvatar support seed

* fix
2025-08-06 15:20:47 +08:00
bukejiyu
20839abccf qwen3_moe (#3084) 2025-08-06 14:45:27 +08:00
Divano
91dc87f1c5 add some evil cases (#3240)
* add repitation early stop cases

* add repitation early stop cases

* add bad cases

* add bad cases

* add evil cases
2025-08-06 14:23:55 +08:00
xjkmfa
256a82b0b3 Add ci case for min token and max token (#3229)
Co-authored-by: xujing43 <xujing43@baidu.com>
2025-08-06 14:10:57 +08:00
Zero Rains
36dc73470d Fix the confused enable_early_stop when only set early_stop_config (#3214)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* fix the confused early_stop_config when only set early_stop_config

* pre-commit

* write a general method
2025-08-06 11:42:27 +08:00
YuanRisheng
a6e8b780f8 fix approve (#3224) 2025-08-06 10:36:01 +08:00
yangjianfengo1
89397516a8 [New Feature] Support W4Afp8 MoE GroupGemm (#3171)
* init

* 增加多线程编译

* fix bug

* fix bug

* code style

* 增加fp16

* 将print替换成assert

* 修复stmatrix

* 减小单测shape

* 减小单测shape
2025-08-06 10:34:05 +08:00
sg263
841e831575 [Trace]add trace when fd start (#3174)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* add opentelemetry

* add opentelemetry

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* add opentelemetry on dequeue

* fix annotation

* fix annotation when add opentelemetry

* fix opentelemetry-instrumentation-fastapi

* fix pentelemetry-bootstrap

* fix opentelemetry can not work in uvicorn

* move conf to env

* fd start add trace

* fix pre-commit

* fix pre-commit

* change FD_JOB_ID

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: shige <shige@baidu.com>
2025-08-05 21:18:27 +08:00
YUNSHEN XIE
e0bbd3b6ca fix approve ci (#3212)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-05 17:21:26 +08:00
Yuan Xiaolan
7ce00e597c support qk norm (#3145) 2025-08-05 16:46:14 +08:00
RAM
4a10e29804 fix mla attention backend (#3176) 2025-08-05 16:43:15 +08:00
Yuan Xiaolan
af543b7f0f revise get_moe_scores (#3164) 2025-08-05 16:43:07 +08:00
Divano
e24929efa3 Ce add bad cases (#3215)
* add repitation early stop cases

* add repitation early stop cases

* add bad cases

* add bad cases
2025-08-05 16:37:28 +08:00
lizexu123
b01cfd6007 [BugFix] support real batch_size (#3109)
* support real bsz

* fix

* fix xpu_model_runner.py,gpu_model_runner.py,gcu_model_runner.py,iluvatar_model_runner.py

* add event_loop_ep

* fix

* Add comments

* fix

* support mtp real_batch_size

* fix

* self.tmp_seq_lens_this_time->self.seq_lens_this_time_buffer

* fix

* fix VL real_seq_lens_this_time

* fix

* fix mtp

* fix

* fix mtp

* fix xpu

* fix
2025-08-05 16:33:54 +08:00
Jiang-Jia-Jun
55939f7942 Update engine.py 2025-08-05 16:10:36 +08:00
chen
04fc7eb931 fix test_air_top_p_sampling name (#3211) 2025-08-05 15:47:50 +08:00
Divano
9f1936ae28 Ce add repitation early stop cases (#3213)
* add repitation early stop cases

* add repitation early stop cases
2025-08-05 15:47:28 +08:00
RichardWooSJTU
1e9a8e8cef fix lm head bias (#3185)
Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
2025-08-05 15:40:24 +08:00
RichardWooSJTU
f5c64a074c [EP] Refactor DeepEP Engine Organization for Mixed Mode & Buffer Management Optimization (#3182)
* Add support for mixed-ep across multi nodes

* code refine

---------

Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
2025-08-05 15:40:11 +08:00
ming1753
14ed75f7d3 [Test] scaled_gemm_f8_i4_f16 skip test while sm != 89 (#3210) 2025-08-05 15:25:28 +08:00
yangjianfengo1
40f7f3e0d8 [New Feature] fa3 支持flash mask (#3184)
* 支持flash mask

* 修改test_flash_mask

* 修改test.sh
2025-08-05 12:20:48 +08:00
YUNSHEN XIE
b8f3c73aac fix coverage report (#3198)
* fix coverage report

* fix
2025-08-05 11:24:55 +08:00
Divano
fb7a0689cc add more cases (#3207) 2025-08-05 11:17:36 +08:00
RAM
c593e1a39c [Bug Fix]Fix bug of append attention test case (#3202)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
2025-08-05 11:04:45 +08:00
RichardWooSJTU
e39159f3bd Add switch to apply fine-grained per token quant fp8 (#3192)
Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
2025-08-04 19:54:03 -07:00
Divano
88596c0c63 Add more base chat cases (#3203)
* add test base class

* fix codestyle

* fix codestyle

* add base chat
2025-08-05 10:24:12 +08:00
lizhenyun01
fe540f6caa [plugin] Custom model_runner/model support (#3186)
* support custom model&&model_runner

* fix merge

* add test && update doc

* fix codestyle

* fix unittest

* load model in rl
2025-08-04 18:52:39 -07:00
Sunny-bot1
72ef5a9c93 [FIX]fix bad_words when sending requests consecutively (#3197)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* fix bad_words

* fix log

* fix log
2025-08-04 05:59:41 -07:00
Yuan Xiaolan
1f8289e106 fix expertwise_scale (#3181) 2025-08-04 20:06:15 +08:00
YuBaoku
3eb9a5df60 [CI] add test_compare_top_logprobs (#3191) 2025-08-04 19:49:24 +08:00
SunLei
68bc1d12c0 [Bugfix] Fix uninitialized decoded_token and add corresponding unit test. (#3195) 2025-08-04 19:23:58 +08:00
Longzhi Wang
01d7586661 [Bug fix] Fix cudagraph when use ep. (#3130)
* fix cudagraph when use ep

* fix typo

* reduce full length to adapt large bsz such 128/256
2025-08-04 18:06:18 +08:00
周周周
2bd8a50649 remove useless code (#3166) 2025-08-04 18:03:08 +08:00
gaoziyuan
0443587a57 【Feature】support qwen3 name_mapping (#3179)
* add fd plugins && rm model_classed

* fix reviews

* add docs

* fix

* fix unitest ci

* support qwen3 name_mapping
2025-08-04 01:34:07 -07:00
Zero Rains
17f51f0c92 [unitest] fix the bug in test_sampler (#3157) 2025-08-04 01:23:25 -07:00
YuanRisheng
79bbacc152 Fix approve shell scripts (#3108)
* fix approve

* fix
2025-08-04 15:51:33 +08:00
Divano
3bfb2eca92 Update test_base_chat.py (#3183) 2025-08-04 15:09:53 +08:00
ltd0924
c9e6ce1518 Update cache_messager.py (#3172) 2025-08-04 14:32:34 +08:00
gaoziyuan
4021d66ea5 【Feature】add fd plugins && rm model_classes (#3123)
* add fd plugins && rm model_classed

* fix reviews

* add docs

* fix

* fix unitest ci
2025-08-03 19:53:20 -07:00
bukejiyu
1582814905 fix load_pre_sharded_checkpoint (#3152)
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-04 10:44:20 +08:00
Divano
66d3bb89ad Update __init__.py (#3163)
升级测试基类兼容性
2025-08-04 09:40:09 +08:00
AIbin
22fe695f1c 【Inference Optimize】Support automatic generation of marlin kernel (#3149)
* Support automatic generation of marlin kernel
2025-08-01 22:43:18 +08:00
ApplEOFDiscord
b71cbb466d [Feature] remove dependency on enable_mm and refine multimodal's code (#3014)
* remove dependency on enable_mm

* fix codestyle check error

* fix codestyle check error

* update docs

* resolve conflicts on model config

* fix unit test error

* fix code style check error

---------

Co-authored-by: shige <1021937542@qq.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-08-01 20:01:18 +08:00
plusNew001
243394044d [XPU]Updata XPU dockerfiles (#3144)
* [CI] add xpu ci case

* [CI]Update run_ci_xpu.sh

* [XPU]Update Dockerfile.xpu

* Update Dockerfile.xpu
2025-08-01 19:41:59 +08:00
Zhang Yulong
0eb32bb9c8 add cases (#3155) 2025-08-01 18:38:57 +08:00
yangjianfengo1
64d7a3194d 集中式支持fa3 (#3112) 2025-08-01 18:03:36 +08:00
YUNSHEN XIE
bdb83e007d fix ci (#3141) 2025-08-01 17:42:26 +08:00
Divano
50db0d7ba9 add case (#3150)
* add test base class

* fix codestyle

* fix codestyle

* add base chat
2025-08-01 17:30:58 +08:00
Ryan
94264bbf60 [Code Simplification] Refactor Post-processing in VL Model Forward Method (#2937)
* rm sth useless

* refactor model forward

* mv bool index to kernel
2025-08-01 17:28:07 +08:00
yinwei
3a4db15765 Fix out-of-memory issue during single-XPU deployment (#3133) 2025-08-01 17:12:03 +08:00
JYChen
c34088b0fd fix stop seq unittest (#3126) 2025-08-01 16:50:05 +08:00
ming1753
fc5f43c6bc [Docs] Optimal Deployment (#2768) 2025-08-01 11:56:27 +08:00
chen
a2f5cc54f8 moe preprocess op support 160 experts and fused_moe triton kernel name add K (#3121) 2025-08-01 10:46:20 +08:00
Divano
1d93565082 [CE] Add base test class for web server testing (#3120)
* add test base class

* fix codestyle

* fix codestyle
2025-07-31 23:28:50 +08:00
YUNSHEN XIE
e1011e92d9 disable test_cuda_graph.py (#3124) 2025-07-31 22:03:48 +08:00
plusNew001
8c63237cfa [CI] add xpu ci case (#3111)
* [CI] add xpu ci case

* [CI]Update run_ci_xpu.sh
2025-07-31 22:03:34 +08:00
YUNSHEN XIE
ff6a109b4d Describe PR diff coverage using JSON file (#3114)
* Refactored ci pipeline

* update

* Describe PR diff coverage using JSON file

* remove pip cache setting from Approve

* fix

* update
2025-07-31 21:59:20 +08:00
992 changed files with 121903 additions and 15679 deletions

187
.github/workflows/_accuracy_test.yml vendored Normal file
View File

@@ -0,0 +1,187 @@
name: Accuracy Test
description: "Run Accuracy Tests"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
accuracy_tests:
runs-on: [self-hosted, GPU-h20-1Cards]
timeout-minutes: 60
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
docker pull ${docker_image}
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run FastDeploy Base Tests
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
if [ ! -d "${MODEL_CACHE_DIR}" ]; then
echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
exit 1
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
python -m pip install pytest
wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
chmod +x ./llm-deploy-linux-amd64
./llm-deploy-linux-amd64 -python python3.10 \
-model_name ERNIE-4.5-0.3B-Paddle \
-model_path /MODELDATA \
--skip install
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
pushd tests/ce/deploy
ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk "{print \$2}" | xargs -r kill -9
python3.10 deploy.py > dd.log 2>&1 &
sleep 3
curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
popd
pushd tests/ce/accuracy_cases
export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
export TEMPLATE=TOKEN_LOGPROB
export MODEL_SIZE=0.3B
TEST_EXIT_CODE=0
python gsm8k.py || TEST_EXIT_CODE=1
popd
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
'
if [ -f ./FastDeploy/exit_code.env ]; then
source ./FastDeploy/exit_code.env
cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}

231
.github/workflows/_base_test.yml vendored Normal file
View File

@@ -0,0 +1,231 @@
name: Base Test
description: "Run Base Tests"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
base_tests:
runs-on: [self-hosted, GPU-h20-1Cards]
timeout-minutes: 60
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
docker pull ${docker_image}
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run FastDeploy Base Tests
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
if [ ! -d "${MODEL_CACHE_DIR}" ]; then
echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
exit 1
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install paddlepaddle-gpu==3.3.0.dev20250917 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
python -m pip install pytest
wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
chmod +x ./llm-deploy-linux-amd64
./llm-deploy-linux-amd64 -python python3.10 \
-model_name ERNIE-4.5-0.3B-Paddle \
-model_path /MODELDATA \
--skip install
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
pushd tests/ce/deploy
ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk "{print \$2}" | xargs -r kill -9
python3.10 deploy.py > dd.log 2>&1 &
sleep 3
curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
check_service() {
local timeout=${1:-90}
local url="http://localhost:${FLASK_PORT}/wait_for_infer?timeout=${timeout}"
local resp
resp=$(curl -s -X POST "$url")
if echo "$resp" | grep -q "服务启动超时"; then
exit 8
fi
}
check_service 90
popd
pushd tests/ce/server
export URL=http://localhost:${FD_API_PORT}/v1/chat/completions
export TEMPLATE=TOKEN_LOGPROB
TEST_EXIT_CODE=0
python -m pytest -sv test_base_chat.py test_compare_top_logprobs.py test_logprobs.py test_params_boundary.py test_seed_usage.py test_stream.py test_evil_cases.py test_completions.py test_return_token_ids.py || TEST_EXIT_CODE=1
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\", \"--early-stop-config\": \"{\\\"enable_early_stop\\\":true, \\\"window_size\\\":6, \\\"threshold\\\":0.93}\"}"
check_service 90
python -m pytest -sv test_repetition_early_stop.py || TEST_EXIT_CODE=1
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{ \"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\", \"--max-concurrency\": 5, \"--max-waiting-time\": 1 }"
check_service 90
python -m pytest -sv test_max_concurrency.py || TEST_EXIT_CODE=1
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{ \"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\", \"--max-concurrency\": 5000, \"--max-waiting-time\": 1 }"
check_service 90
python -m pytest -sv test_max_waiting_time.py || TEST_EXIT_CODE=1
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"21b_mtp.yaml\", \"--enable-logprob\": \"False\"}"
check_service 180
export TEMPLATE=TOKEN_NORMAL
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"21b_sot.yaml\", \"--enable-logprob\": \"False\"}"
check_service 360
export TEMPLATE=TOKEN_NORMAL
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1
popd
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
'
if [ -f ./FastDeploy/exit_code.env ]; then
source ./FastDeploy/exit_code.env
cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}

View File

@@ -22,12 +22,22 @@ on:
description: "Enable nightly build mode (e.g. add date suffix to version)"
required: false
type: string
default: "ON"
default: "OFF"
FD_VERSION:
description: "FastDeploy Package Version"
required: false
type: string
default: ""
PADDLEVERSION:
description: "Paddle Version Build Use"
required: false
type: string
default: ""
PADDLE_WHL_URL:
description: "Paddle Wheel Package URL"
required: false
type: string
default: ""
UPLOAD:
description: "Upload Package"
required: false
@@ -45,6 +55,7 @@ on:
jobs:
fd-build:
runs-on: [self-hosted, GPU-Build]
timeout-minutes: 360
outputs:
wheel_path: ${{ steps.set_output.outputs.wheel_path }}
steps:
@@ -85,13 +96,22 @@ jobs:
compile_arch: ${{ inputs.COMPILE_ARCH }}
fd_version: ${{ inputs.FD_VERSION }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
BRANCH_REF: ${{ github.ref_name }}
PADDLEVERSION: ${{ inputs.PADDLEVERSION }}
PADDLE_WHL_URL: ${{ inputs.PADDLE_WHL_URL }}
WITH_NIGHTLY_BUILD: ${{ inputs.WITH_NIGHTLY_BUILD }}
run: |
set -x
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
IFS='/' read -ra parts <<< "${GITHUB_WORKSPACE}"
len=${#parts[@]}
CCACHE_DEFAULT_DIR="/$(IFS=/; echo "${parts[*]:1:$((len-5))}")"
echo "$CCACHE_DEFAULT_DIR"
CACHE_DIR="${CACHE_DIR:-$CCACHE_DEFAULT_DIR}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
@@ -109,6 +129,10 @@ jobs:
-e "COMPILE_ARCH=${compile_arch}" \
-e "FD_VERSION=${fd_version}" \
-e "WITH_NIGHTLY_BUILD=${WITH_NIGHTLY_BUILD}" \
-e "PADDLEVERSION=${PADDLEVERSION}" \
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
@@ -116,6 +140,7 @@ jobs:
fi
git config --global --add safe.directory /workspace/FastDeploy
chown -R $(whoami) /workspace/FastDeploy
cd FastDeploy
if [[ "${WITH_NIGHTLY_BUILD}" == "ON" ]];then
GIT_COMMIT_TIME=$(git --no-pager show -s --format=%ci HEAD)
@@ -124,14 +149,20 @@ jobs:
echo "Date Only: $DATE_ONLY"
export FASTDEPLOY_VERSION="${FASTDEPLOY_VERSION}.dev${DATE_ONLY}"
fi
pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
pip config set install.trusted-host pip.baidu.com
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# 针对不同分支和tag使用不同的PaddlePaddle安装包
if [[ "${PADDLE_WHL_URL}" != "" ]];then
python -m pip install ${PADDLE_WHL_URL}
elif [[ "${PADDLEVERSION}" != "" ]];then
python -m pip install paddlepaddle-gpu==${PADDLEVERSION} -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
else
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
fi
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install wheel
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
# 编译RDMA
export ENABLE_FD_RDMA=1
bash build.sh 1 python false [${COMPILE_ARCH}]

73
.github/workflows/_ci_image_build.yml vendored Normal file
View File

@@ -0,0 +1,73 @@
name: Docker Build
description: "FastDeploy CI Image Build"
on:
workflow_call:
inputs:
CI_DOCKER_IMAGE_NAME:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
DOCKER_IMAGE_NAME:
description: "Build Images"
required: false
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate"
outputs:
docker_name_precheck:
description: "Output path of the generated wheel"
value: ${{ jobs.docker_build.outputs.docker_name_precheck }}
jobs:
docker_build:
runs-on: [self-hosted, Docker-Build]
outputs:
docker_name_precheck: ${{ steps.docker_build.outputs.docker_name_precheck }}
steps:
- name: Docker Build
id: docker_build
shell: bash
env:
docker_image_name: ${{ inputs.CI_DOCKER_IMAGE_NAME }}
docker_image: ${{ inputs.DOCKER_IMAGE_NAME }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
# Docker Build
cd tools/dockerfile/
set -e
cp ../../requirements.txt ./
cp ../../scripts/unittest_requirement.txt ./
docker build -t ${docker_image_name} -f Dockerfile.ci . \
--network host \
--no-cache
docker push ${docker_image_name}
echo "docker_name_precheck=${docker_image_name}" >> $GITHUB_OUTPUT

View File

@@ -68,7 +68,7 @@ jobs:
branch_name=${{ github.ref_name }}
target_path=paddle-github-action/BRANCH/FastDeploy/${branch_name}/${commit_id}
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
wget -O bos_tools.py -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls

View File

@@ -39,6 +39,7 @@ jobs:
docker_image: ${{ inputs.DOCKER_IMAGE }}
paddletest_archive_url: ${{ inputs.PADDLETEST_ARCHIVE_URL }}
run: |
docker pull ${docker_image}
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
@@ -62,18 +63,24 @@ jobs:
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
if [[ "$last_char" =~ [0-7] ]]; then
DEVICES="$last_char"
else
DEVICES="0"
fi
FLASK_PORT=$((9160 + DEVICES * 100))
FD_API_PORT=$((9180 + DEVICES * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + DEVICES * 100))
FD_METRICS_PORT=$((9170 + DEVICES * 100))
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
@@ -85,28 +92,51 @@ jobs:
exit 1
fi
PARENT_DIR=$(dirname "$WORKSPACE")
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
docker run --ipc=host --pid=host --net=host \
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
pip config set install.trusted-host pip.baidu.com
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
@@ -117,6 +147,7 @@ jobs:
--skip install
cd PaddleTest/framework/ServeTest
ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk "{print \$2}" | xargs -r kill -9
python3.10 deploy.py > dd.log 2>&1 &
sleep 3
curl -X POST http://0.0.0.0:${FLASK_PORT}/start \
@@ -124,6 +155,10 @@ jobs:
-d "{\"--model\": \"/MODELDATA/ERNIE-4.5-0.3B-Paddle\"}"
curl -X POST http://localhost:${FLASK_PORT}/wait_for_infer?timeout=90
curl -s -o /dev/null -w "%{http_code}" -m 2 "http://0.0.0.0:${FD_API_PORT}/health"
curl -X POST "http://0.0.0.0:${FD_API_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}"
set +e
rm -rf ./baseline_output
cp -r baseline/ERNIE-4.5-0.3B-Paddle ./baseline_output

151
.github/workflows/_pre_ce_test.yml vendored Normal file
View File

@@ -0,0 +1,151 @@
name: Pre-CE-Test
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
run_ce_cases:
runs-on: [self-hosted, PRE_CE_RUN_2Card]
timeout-minutes: 60
steps:
- name: Print current runner name
run: |
echo "Current runner name: ${{ runner.name }}"
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
docker pull ${docker_image}
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run CI unittest
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
FD_ZMQ_RECV_REQUEST_SERVER_PORT=$((42048 + DEVICE_PORT * 100))
FD_ZMQ_SEND_RESPONSE_SERVER_PORT=$((42038 + DEVICE_PORT * 100))
FD_ZMQ_CONTROL_CMD_SERVER_PORTS=$((42028 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --net=host \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e "fd_wheel_url=${fd_wheel_url}" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install ${fd_wheel_url}
bash scripts/run_pre_ce.sh
'

170
.github/workflows/_stable_test.yml vendored Normal file
View File

@@ -0,0 +1,170 @@
name: Stable Test
description: "Run Stable Tests"
on:
workflow_call:
inputs:
DOCKER_IMAGE:
description: "Build Images"
required: true
type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
type: string
FASTDEPLOY_WHEEL_URL:
description: "URL of the FastDeploy Wheel."
required: true
type: string
CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
jobs:
stable_tests:
runs-on: [self-hosted, GPU-h1z1-2Cards]
timeout-minutes: 60
steps:
- name: Code Prepare
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
run: |
set -x
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
docker pull ${docker_image}
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}*
fi
'
wget -q ${fd_archive_url}
tar -xf FastDeploy.tar.gz
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: Run FastDeploy Stable Tests
shell: bash
env:
docker_image: ${{ inputs.DOCKER_IMAGE }}
fastdeploy_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
run: |
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42038 + DEVICE_PORT * 100))
FD_INFERENCE_MSG_QUEUE_ID=$(( 42048 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "FD_INFERENCE_MSG_QUEUE_ID=${FD_INFERENCE_MSG_QUEUE_ID}"
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
if [ ! -d "${MODEL_CACHE_DIR}" ]; then
echo "Error: MODEL_CACHE_DIR '${MODEL_CACHE_DIR}' does not exist."
exit 1
fi
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
-e fastdeploy_wheel_url=${fastdeploy_wheel_url} \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e "FD_INFERENCE_MSG_QUEUE_ID=${FD_INFERENCE_MSG_QUEUE_ID}" \
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
python -m pip install pytest
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
TEST_EXIT_CODE=0
pushd tests/ce/stable_cases
bash launch_model.sh /MODELDATA
bash run.sh || TEST_EXIT_CODE=1
popd
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
'
if [ -f ./FastDeploy/exit_code.env ]; then
source ./FastDeploy/exit_code.env
cat ./FastDeploy/exit_code.env >> $GITHUB_ENV
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}

View File

@@ -1,4 +1,4 @@
name: Run FastDeploy Unit Tests and Coverage
name: Coverage Check
description: "Run FastDeploy Unit Tests and Coverage"
on:
@@ -22,13 +22,32 @@ on:
required: false
type: string
default: ""
MODEL_CACHE_DIR:
description: "Cache Dir Use"
required: false
type: string
default: ""
secrets:
github-token:
required: true
jobs:
check_cov_skip:
uses: ./.github/workflows/check-bypass.yml
secrets:
github-token: ${{ secrets.github-token }}
with:
workflow-name: coverage
run_tests_with_coverage:
runs-on: [self-hosted, GPU-h1z1-4Cards]
runs-on: [self-hosted, GPU-h1z1-2Cards]
timeout-minutes: 90
needs: check_cov_skip
if: needs.check_cov_skip.outputs.can-skip != 'true'
outputs:
diff_cov_file_url: ${{ steps.cov_upload.outputs.diff_cov_file_url }}
unittest_failed_url: ${{ steps.unittest_failed.outputs.unittest_failed_url }}
unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }}
diff_cov_result_json_url: ${{ steps.cov_upload.outputs.diff_cov_result_json_url }}
steps:
- name: Code Prepare
shell: bash
@@ -41,7 +60,7 @@ jobs:
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
docker pull ${docker_image}
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
@@ -66,58 +85,125 @@ jobs:
fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
CACHE_DIR: ${{ inputs.CACHE_DIR }}
BASE_REF: ${{ github.event.pull_request.base.ref }}
MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }}
IS_PR: ${{ github.event_name == 'pull_request' }}
run: |
set -x
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
if [[ "$IS_PR" == "true" ]]; then
echo "Running on PR"
else
echo "Not a PR"
fi
runner_name="${{ runner.name }}"
CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}')
DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,)
DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1)
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host \
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
FLASK_PORT=$((42068 + DEVICE_PORT * 100))
FD_API_PORT=$((42088 + DEVICE_PORT * 100))
FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100))
FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100))
FD_CACHE_QUEUE_PORT=$((42098 + DEVICE_PORT * 100))
echo "Test ENV Parameter:"
echo "========================================================="
echo "FLASK_PORT=${FLASK_PORT}"
echo "FD_API_PORT=${FD_API_PORT}"
echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}"
echo "FD_METRICS_PORT=${FD_METRICS_PORT}"
echo "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}"
echo "DEVICES=${DEVICES}"
echo "========================================================="
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}"
echo "CACHE_DIR is set to ${CACHE_DIR}"
if [ ! -f "${CACHE_DIR}/gitconfig" ]; then
touch "${CACHE_DIR}/gitconfig"
fi
pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
pip config set install.trusted-host pip.baidu.com
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT $FD_CACHE_QUEUE_PORT)
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log"
echo "==== LOG_FILE is ${LOG_FILE} ===="
python -m pip install coverage
python -m pip install diff-cover
python -m pip install ${fd_wheel_url}
export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
TEST_EXIT_CODE=0
bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
coverage combine coveragedata/
coverage xml -o python_coverage_all.xml
COVERAGE_EXIT_CODE=0
diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=90 || COVERAGE_EXIT_CODE=9
echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env
echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE
for port in "${PORTS[@]}"; do
PIDS=$(lsof -t -i :$port || true)
if [ -n "$PIDS" ]; then
echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE
echo "$PIDS" | xargs -r kill -9
echo "Port $port cleared" | tee -a $LOG_FILE
else
echo "Port $port is free" | tee -a $LOG_FILE
fi
done
echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE
echo "========================================================="
echo "Ensuring no stale container named ${runner_name} ..."
if [ "$(docker ps -a -q -f name=${runner_name})" ]; then
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --net=host \
--name ${runner_name} \
--cap-add=SYS_PTRACE --shm-size=64G \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-v "${MODEL_CACHE_DIR}:/ModelData:ro" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
-e "FLASK_PORT=${FLASK_PORT}" \
-e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \
-e TZ="Asia/Shanghai" \
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
-e "IS_PR=${IS_PR}" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install -r scripts/unittest_requirement.txt
python -m pip install ${fd_wheel_url}
rm -rf fastdeploy
# coverage subprocess use
python -m pip install ${fd_wheel_url} --no-deps --target=/workspace/FastDeploy
export PYTHONPATH=/workspace/FastDeploy/
if [ -d "tests/plugins" ]; then
cd tests/plugins
python setup.py install
cd ../..
else
echo "Warning: tests/plugins directory not found, skipping setup.py install"
fi
export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc
TEST_EXIT_CODE=0
bash scripts/coverage_run.sh || TEST_EXIT_CODE=8
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env
coverage combine coveragedata/ || echo "No data to combine"
coverage report
coverage xml -o python_coverage_all.xml
COVERAGE_EXIT_CODE=0
if [[ "$IS_PR" == "true" ]]; then
diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=80 --json-report diff_coverage.json || COVERAGE_EXIT_CODE=9
python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml
'
if [ -f FastDeploy/exit_code.env ]; then
cat FastDeploy/exit_code.env >> $GITHUB_ENV
fi
else
echo "Not a PR, skipping diff-cover"
fi
echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env
'
if [ -f FastDeploy/exit_code.env ]; then
cat FastDeploy/exit_code.env >> $GITHUB_ENV
fi
- name: Upload unit resule and diff coverage to bos
id: cov_upload
shell: bash
@@ -125,42 +211,97 @@ jobs:
cd FastDeploy
commit_id=${{ github.event.pull_request.head.sha }}
pr_num=${{ github.event.pull_request.number }}
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}/CoverageData
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
target_path=paddle-github-action/PR/FastDeploy/${pr_num}/${commit_id}/SM${compile_arch//,/_}
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py -O bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
diff_cov_file="diff_coverage.xml"
if [ -f ${diff_cov_file} ];then
python ${push_file} ${diff_cov_file} ${target_path}
python ${push_file} ${diff_cov_file} ${target_path}/CoverageData
target_path_stripped="${target_path#paddle-github-action/}"
DIFF_COV_FILE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${diff_cov_file}
DIFF_COV_FILE_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_file}
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_OUTPUT
echo "diff_cov_file_url=${DIFF_COV_FILE_URL}" >> $GITHUB_ENV
fi
- name: Determine Unit Succ and whether the coverage rate reaches 90%
diff_cov_result_json="diff_coverage.json"
if [ -f ${diff_cov_result_json} ];then
python ${push_file} ${diff_cov_result_json} ${target_path}/CoverageData
target_path_stripped="${target_path#paddle-github-action/}"
DIFF_COV_JSON_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/CoverageData/${diff_cov_result_json}
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_OUTPUT
echo "diff_cov_result_json_url=${DIFF_COV_JSON_URL}" >> $GITHUB_ENV
fi
unittest_result="failed_tests.log"
if [ -s ${unittest_result} ];then
python ${push_file} ${unittest_result} ${target_path}/UnitTestResult
target_path_stripped="${target_path#paddle-github-action/}"
UNIT_TEST_RESULT_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/UnitTestResult/${unittest_result}
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_OUTPUT
echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV
fi
- name: Check Unit Test Success
shell: bash
run: |
cd FastDeploy
if [ "$TEST_EXIT_CODE" -eq 8 ]; then
filename=$(basename "$unittest_failed_url")
if [ -z "${unittest_failed_url}" ]; then
echo "No diff unit failed file URL provided."
else
rm -rf "${filename}"
wget -O ${filename} ${unittest_failed_url} || echo "Download unittest file failed, but continuing..."
fi
echo "Unit tests failed (exit code 8)"
if [ -f "${filename}" ];then
echo "Failed test cases:"
cat "${filename}"
fi
exit "$TEST_EXIT_CODE"
fi
echo "All tests passed"
- name: Verify Code Coverage Threshold (80%)
if: ${{ github.event_name == 'pull_request' }}
shell: bash
run: |
cd FastDeploy
if [ "$COVERAGE_EXIT_CODE" -eq 9 ]; then
echo "Coverage generation failed (exit code 9)"
filename=$(basename "$diff_cov_result_json_url")
if [ -z "${diff_cov_result_json_url}" ]; then
echo "No diff cov result file URL provided."
else
rm -rf "${filename}"
wget -O ${filename} ${diff_cov_result_json_url} || echo "Download cov json file failed, but continuing..."
fi
if [ -f "${filename}" ];then
echo "Failed test cases:"
if command -v jq >/dev/null 2>&1; then
jq . "${filename}"
else
cat "${filename}"
fi
fi
exit "$COVERAGE_EXIT_CODE"
fi
echo "All tests and coverage passed"
echo "coverage passed"
exit 0
diff_coverage_report:
needs: run_tests_with_coverage
if: always()
runs-on: ubuntu-latest
env:
fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
steps:
- name: coverage diff file download
shell: bash
env:
diff_cov_file_url: ${{ needs.run_tests_with_coverage.outputs.diff_cov_file_url }}
run: |
wget ${fd_archive_url}
tar -xf FastDeploy.tar.gz
cd FastDeploy
if [ -z "${diff_cov_file_url}" ]; then
echo "No diff coverage file URL provided."
exit 0
@@ -170,6 +311,9 @@ jobs:
if: ${{ needs.run_tests_with_coverage.outputs.diff_cov_file_url != null && needs.run_tests_with_coverage.outputs.diff_cov_file_url != '' }}
uses: codecov/codecov-action@v5
with:
files: ./diff_coverage.xml
files: ./FastDeploy/diff_coverage.xml
name: python diff coverage
verbose: true
disable_search: true
commit_parent: false
flags: diff

View File

@@ -6,6 +6,9 @@ on:
- develop
- 'release/*'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
Approval:
name: Approval
@@ -33,7 +36,6 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: '3.10'
cache: 'pip'
- name: Run approval check script
run: |

248
.github/workflows/ce_job.yml vendored Normal file
View File

@@ -0,0 +1,248 @@
name: CE Compile Job
on:
workflow_dispatch:
push:
branches:
- develop
- 'release/*'
permissions: read-all
concurrency:
group: CE-Job-${{ github.ref }}-${{ github.sha }}
cancel-in-progress: true
jobs:
ce_job_pre_check:
runs-on: ubuntu-latest
env:
COMPILE_BRANCH: ${{ vars.COMPILE_BRANCH }}
CE_COMPILE_SELECTION: ${{ vars.CE_COMPILE_SELECTION }}
COMPILE_USE_PADDLE_WHL_URL_MAPPINGS: ${{ vars.COMPILE_USE_PADDLE_WHL_URL_MAPPINGS }}
outputs:
branch_match: ${{ steps.set_output.outputs.branch_match }}
compile_use_paddle_whl_url: ${{ steps.set_output.outputs.compile_use_paddle_whl_url }}
sm8689_match: ${{ steps.set_output.outputs.sm8689_match }}
sm8090_match: ${{ steps.set_output.outputs.sm8090_match }}
steps:
- name: Set Version
id: set_output
env:
COMPILE_BRANCH: ${{ env.COMPILE_BRANCH }}
CE_COMPILE_SELECTION: ${{ env.CE_COMPILE_SELECTION }}
COMPILE_USE_PADDLE_WHL_URL_MAPPINGS: ${{ env.COMPILE_USE_PADDLE_WHL_URL_MAPPINGS }}
GITHUB_REF_NAME: ${{ github.ref_name }}
run: |
# 选择要触发编译任务的分支 done
# 选择指定分支要编译的任务 8090或者8689
# 指定分支编译要使用的Paddle的安装包,默认使用nightly最新的
IFS=',' read -ra BRANCHES <<< "$COMPILE_BRANCH"
MATCH=false
for b in "${BRANCHES[@]}"; do
if [[ "$b" == "${GITHUB_REF_NAME}" ]]; then
MATCH=true
break
fi
done
echo "branch_match=$MATCH" >> $GITHUB_OUTPUT
# 通过变量CE_COMPILE_SELECTION中的映射关系,决定分支是编译sm8090还是sm8689
for pair in $(echo "$CE_COMPILE_SELECTION" | tr ';' ' '); do
branch=$(echo "$pair" | cut -d',' -f1)
compile_task_list=$(echo "$pair" | cut -d',' -f2)
if [[ "$branch" == "$GITHUB_REF_NAME" ]]; then
# 判断里面是否包含 sm8090 或 sm8689
if [[ "$compile_task_list" == *"sm8090"* ]]; then
echo "sm8090_match=true" >> $GITHUB_OUTPUT
fi
if [[ "$compile_task_list" == *"sm8689"* ]]; then
echo "sm8689_match=true" >> $GITHUB_OUTPUT
fi
break
fi
done
# 通过变量COMPILE_USE_PADDLE_WHL_URL_MAPPINGS中的映射关系,决定是否是安装指定版本的Paddle还是直接安装URL
for pair in $(echo $COMPILE_USE_PADDLE_WHL_URL_MAPPINGS | tr ';' ' '); do
branch=$(echo "$pair" | cut -d',' -f1)
paddle_whl_url=$(echo "$pair" | cut -d',' -f2)
if [[ "$branch" == "${{ github.ref_name }}" ]]; then
FOUND_PADDLE_URL="$paddle_whl_url"
echo "compile_use_paddle_whl_url=${FOUND_PADDLE_URL}" >> $GITHUB_OUTPUT
break
fi
done
print_ce_job_pre_check_outputs:
runs-on: ubuntu-latest
needs: ce_job_pre_check
steps:
- name: Print outputs as JSON
run: |
echo '${{ toJSON(needs.ce_job_pre_check.outputs) }}'
clone:
environment: CodeSync
name: FD-Clone-Linux
runs-on: ubuntu-latest
needs: ce_job_pre_check
if: ${{ needs.ce_job_pre_check.outputs.branch_match == 'true' }}
outputs:
repo_archive_url: ${{ steps.set_output.outputs.repo_archive_url }}
steps:
- name: Clone FastDeploy
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request'
&& github.event.pull_request.base.ref
|| github.ref_name }}
submodules: 'recursive'
fetch-depth: 1000
- name: Python Setup
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Code Info Show and Upload
id: set_output
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
run: |
git config --unset http.https://github.com/.extraheader
git submodule foreach --recursive sh -c "git config --local --unset-all 'http.https://github.com/.extraheader'"
git submodule foreach --recursive sh -c "git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'"
echo "Current HEAD Log:"
git log --oneline -n 5
ls
cd ..
tar -zcf FastDeploy.tar.gz FastDeploy
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-qa/BRANCH/FastDeploy/${branch_name}/${commit_id}
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} FastDeploy.tar.gz ${target_path}
target_path_stripped="${target_path#paddle-qa/}"
REPO_ARCHIVE_URL=https://paddle-qa.bj.bcebos.com/${target_path_stripped}/FastDeploy.tar.gz
echo "repo_archive_url=${REPO_ARCHIVE_URL}" >> $GITHUB_OUTPUT
resultshow:
name: Show Code Archive Output
needs: clone
runs-on: ubuntu-latest
steps:
- name: Print wheel path
run: |
echo "The code archive is located at: ${{ needs.clone.outputs.repo_archive_url }}"
build_sm8090:
name: BUILD_SM8090
needs: [clone, ce_job_pre_check]
if: ${{ needs.ce_job_pre_check.outputs.sm8090_match == 'true' }}
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "80,90"
WITH_NIGHTLY_BUILD: OFF
FD_VERSION: 0.0.0
PADDLE_WHL_URL: ${{ needs.ce_job_pre_check.outputs.compile_use_paddle_whl_url }}
build_sm8689:
name: BUILD_SM8689
needs: [clone, ce_job_pre_check]
if: ${{ needs.ce_job_pre_check.outputs.sm8689_match == 'true' }}
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "86,89"
WITH_NIGHTLY_BUILD: OFF
FD_VERSION: 0.0.0
PADDLE_WHL_URL: ${{ needs.ce_job_pre_check.outputs.compile_use_paddle_whl_url }}
ce_upload_sm8090:
environment: CodeSync
name: CE_UPLOAD
needs: build_sm8090
runs-on: ubuntu-latest
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
COMPILE_ARCH: "80,90"
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Wheel Info Show and Upload
run: |
echo "The wheel is located at: ${{ needs.build_sm8090.outputs.wheel_path }}"
wget -q --no-check-certificate ${{ needs.build_sm8090.outputs.wheel_path }}
filename=$(basename ${{ needs.build_sm8090.outputs.wheel_path }})
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-qa/paddle-pipeline/FastDeploy_ActionCE/SM${COMPILE_ARCH//,/_}/${branch_name}/${commit_id}
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} ${filename} ${target_path}
target_path_stripped="${target_path#paddle-qa/}"
WHEEL_PATH=https://paddle-qa.bj.bcebos.com/${target_path_stripped}/${filename}
target_path_latest=paddle-qa/paddle-pipeline/FastDeploy_ActionCE/SM${COMPILE_ARCH//,/_}/${branch_name}/latest
python ${push_file} ${filename} ${target_path_latest}
target_path_stripped_latest="${target_path_latest#paddle-qa/}"
WHEEL_PATH_LATEST=https://paddle-qa.bj.bcebos.com/${target_path_stripped_latest}/${filename}
echo "commit wheel url is ${WHEEL_PATH}"
echo "latest wheel url is ${WHEEL_PATH_LATEST}"
ce_upload_sm8689:
environment: CodeSync
name: CE_UPLOAD
needs: build_sm8689
runs-on: ubuntu-latest
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8689.outputs.wheel_path }}
COMPILE_ARCH: "86,89"
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Wheel Info Show and Upload
run: |
echo "The wheel is located at: ${{ needs.build_sm8689.outputs.wheel_path }}"
wget -q --no-check-certificate ${{ needs.build_sm8689.outputs.wheel_path }}
filename=$(basename ${{ needs.build_sm8689.outputs.wheel_path }})
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-qa/paddle-pipeline/FastDeploy_ActionCE/SM${COMPILE_ARCH//,/_}/${branch_name}/${commit_id}
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} ${filename} ${target_path}
target_path_stripped="${target_path#paddle-qa/}"
WHEEL_PATH=https://paddle-qa.bj.bcebos.com/${target_path_stripped}/${filename}
target_path_latest=paddle-qa/paddle-pipeline/FastDeploy_ActionCE/SM${COMPILE_ARCH//,/_}/${branch_name}/latest
python ${push_file} ${filename} ${target_path_latest}
target_path_stripped_latest="${target_path_latest#paddle-qa/}"
WHEEL_PATH_LATEST=https://paddle-qa.bj.bcebos.com/${target_path_stripped_latest}/${filename}
echo "commit wheel url is ${WHEEL_PATH}"
echo "latest wheel url is ${WHEEL_PATH_LATEST}"

51
.github/workflows/check-bypass.yml vendored Normal file
View File

@@ -0,0 +1,51 @@
on:
workflow_call:
inputs:
workflow-name:
required: true
type: string
secrets:
github-token:
required: true
outputs:
can-skip:
description: "Whether the workflow can be skipped."
value: ${{ jobs.check-bypass.outputs.can-skip }}
jobs:
check-bypass:
name: Check bypass
runs-on: ubuntu-latest
permissions:
contents: read
env:
CI_TEAM_MEMBERS: '["yuanlehome","YuanRisheng","Jiang-Jia-Jun","DDDivano","XieYunshen"]'
outputs:
can-skip: ${{ steps.check-bypass.outputs.can-skip }}
steps:
- name: Cleanup
run: |
rm -rf * .[^.]*
- id: check-bypass
name: Check Bypass
uses: PFCCLab/ci-bypass@v1
with:
github-token: ${{ secrets.github-token }}
non-pull-request-event-strategy: 'never-skipped'
type: 'composite'
composite-rule: |
{
"any": [
{
"type": "labeled",
"label": ["skip-ci: ${{ inputs.workflow-name }}", "skip-ci: all"],
"username": ${{ env.CI_TEAM_MEMBERS }}
},
{
"type": "commented",
"comment-pattern": [".*/skip-ci ${{ inputs.workflow-name }}.*", ".*/skip-ci all.*"],
"username": ${{ env.CI_TEAM_MEMBERS }}
}
]
}

View File

@@ -1,89 +0,0 @@
name: CI
on:
pull_request:
branches:
- develop
- 'release/*'
workflow_dispatch:
concurrency:
group: ${{ github.event.pull_request.number }}
cancel-in-progress: true
jobs:
build:
runs-on: [self-hosted, GPU-L20-4Card]
steps:
- name: Print current runner name
run: |
echo "Current runner name: ${{ runner.name }}"
# Because the system version is lower than 2.23, the checkout cannot be used.
# - name: Checkout code
# uses: actions/checkout@v4
- name: Code Checkout
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
run: |
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}
fi
'
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
git merge pr/${{ github.event.pull_request.number }}
git log -n 3 --oneline
else
git checkout ${{ github.sha }}
git log -n 3 --oneline
fi
- name: Run CI unittest
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
if [ "${last_char}" = "1" ]; then
gpu_id=2
DEVICES="2,3"
else
gpu_id=0
DEVICES="0,1"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
FD_METRICS_PORT=$((9170 + gpu_id * 100))
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-v "/ssd4/GithubActions/gitconfig:/etc/gitconfig:ro" \
-v "/ssd4/GithubActions/ModelData:/ModelData:ro" \
-v "/ssd4/GithubActions/CacheDir:/root/.cache" \
-v "/ssd4/GithubActions/ConfigDir:/root/.config" \
-e "MODEL_PATH=/ModelData" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c "
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci.sh
"

View File

@@ -13,7 +13,8 @@ concurrency:
jobs:
CI_GCU:
runs-on: [self-hosted, GCU-S60-8Card]
runs-on:
group: GCU
steps:
- name: Print current runner name
run: |
@@ -28,7 +29,9 @@ jobs:
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
docker run --rm --net=host -v $(pwd):/workspace \
-v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
@@ -39,6 +42,7 @@ jobs:
'
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
source ${{ github.workspace }}/../../../proxy
git clone ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
@@ -49,6 +53,9 @@ jobs:
git checkout ${{ github.sha }}
git log -n 3 --oneline
fi
echo "Copy models..."
sudo mkdir -p ci_models && sudo cp -r /work/deps/ERNIE-4.5-21B-A3B-Paddle ci_models
echo "Copy models done."
- name: Run CI unittest
env:
@@ -70,19 +77,21 @@ jobs:
echo "PARENT_DIR:$PARENT_DIR"
echo "Install drivers..."
cd /work/deps
bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
sudo bash TopsRider_i3x_*_deb_amd64.run --driver --no-auto-load -y
cd -
docker run --rm --network=host --ipc=host -it --privileged \
-v $(pwd):/workspace -w /workspace \
-v "/home:/home" \
-v "/work:/work" \
-e "MODEL_PATH=/work/models" \
echo "Create docker..."
docker run --rm --network=host --ipc=host --privileged \
-v $(pwd):/workspace \
-v /home:/home \
-v /work:/work \
-w /workspace \
-e "MODEL_PATH=./ci_models" \
-e "http_proxy=$(git config --global --get http.proxy)" \
-e "https_proxy=$(git config --global --get https.proxy)" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
${docker_image} /bin/bash -c "
${docker_image} /bin/bash -c "
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci_gcu.sh

View File

@@ -11,7 +11,8 @@ concurrency:
jobs:
CI_ILUVATAR:
runs-on: [self-hosted, IXUCA]
runs-on:
group: IXUCA
steps:
- name: Print current runner name
run: |
@@ -27,18 +28,22 @@ jobs:
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
REPO_NAME="${FULL_REPO##*/}"
BASE_BRANCH="${{ github.base_ref }}"
# Clean the repository directory before starting
docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
-e "REPO_NAME=${REPO_NAME}" \
-e "BASE_BRANCH=${BASE_BRANCH}" \
${docker_image} /bin/bash -c '
if [ -d ${REPO_NAME} ]; then
echo "Directory ${REPO_NAME} exists, removing it..."
rm -rf ${REPO_NAME}
fi
'
git config --global http.proxy "http://61.151.249.150:33128"
git config --global https.proxy "http://61.151.249.150:33128"
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git clone ${REPO} ${REPO_NAME}
git clone --recursive ${REPO} ${REPO_NAME} -b ${BASE_BRANCH}
cd FastDeploy
if [ "${{ github.event_name }}" = "pull_request" ]; then
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}

174
.github/workflows/ci_image_update.yml vendored Normal file
View File

@@ -0,0 +1,174 @@
name: CI Images Build
on:
workflow_dispatch:
schedule:
- cron: '0 18 * * *' # 2:00 AM China Standard Time (UTC+8)
permissions: read-all
concurrency:
group: CI-Images-Build-${{ github.ref }}-${{ github.sha }}
cancel-in-progress: true
jobs:
clone:
environment: CodeSync
name: FD-Clone-Linux
runs-on: ubuntu-latest
outputs:
repo_archive_url: ${{ steps.set_output.outputs.repo_archive_url }}
steps:
- name: Clone FastDeploy
uses: actions/checkout@v4
with:
ref: ${{ github.ref_name }}
submodules: 'recursive'
fetch-depth: 1000
- name: Python Setup
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Code Info Show and Upload
id: set_output
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
run: |
git config --unset http.https://github.com/.extraheader
git submodule foreach --recursive sh -c "git config --local --unset-all 'http.https://github.com/.extraheader'"
git submodule foreach --recursive sh -c "git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'"
echo "Current HEAD Log:"
git log --oneline -n 5
ls
cd ..
tar -zcf FastDeploy.tar.gz FastDeploy
if [[ "${{ github.ref_type }}" == "tag" ]]; then
commit_id=${{ github.sha }}
tag_name=${{ github.ref_name }}
target_path=paddle-qa/TAG/FastDeploy/${tag_name}/${commit_id}
else
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-qa/BRANCH/FastDeploy/${branch_name}/${commit_id}
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} FastDeploy.tar.gz ${target_path}
target_path_stripped="${target_path#paddle-qa/}"
REPO_ARCHIVE_URL=https://paddle-qa.bj.bcebos.com/${target_path_stripped}/FastDeploy.tar.gz
echo "repo_archive_url=${REPO_ARCHIVE_URL}" >> $GITHUB_OUTPUT
resultshow:
name: Show Code Archive Output
needs: clone
runs-on: ubuntu-latest
steps:
- name: Print wheel path
run: |
echo "The code archive is located at: ${{ needs.clone.outputs.repo_archive_url }}"
ci_image_build:
name: CI Images Build
needs: clone
uses: ./.github/workflows/_ci_image_build.yml
with:
CI_DOCKER_IMAGE_NAME: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate-precheck
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
build_sm8090:
name: BUILD_SM8090
needs: [clone, ci_image_build]
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "90"
WITH_NIGHTLY_BUILD: ${{ needs.publish_pre_check.outputs.with_nightly_build }}
FD_VERSION: ${{ needs.publish_pre_check.outputs.fd_version }}
PADDLEVERSION: ${{ needs.publish_pre_check.outputs.compile_use_paddle_version }}
PADDLE_WHL_URL: ${{ needs.publish_pre_check.outputs.compile_use_paddle_whl_url }}
unittest_coverage:
name: Run FastDeploy Unit Tests and Coverage
needs: [clone,build_sm8090,ci_image_build]
uses: ./.github/workflows/_unit_test_coverage.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
logprob_test:
name: Run FastDeploy LogProb Tests
needs: [build_sm8090,ci_image_build]
uses: ./.github/workflows/_logprob_test_linux.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
pre_ce_test:
name: Extracted partial CE model tasks to run in CI.
needs: [clone,build_sm8090,ci_image_build]
uses: ./.github/workflows/_pre_ce_test.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
base_test:
name: Run Base Tests
needs: [clone,build_sm8090,ci_image_build]
uses: ./.github/workflows/_base_test.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
accuracy_test:
name: Run Accuracy Tests
needs: [clone,build_sm8090,ci_image_build]
uses: ./.github/workflows/_accuracy_test.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
stable_test:
name: Run Stable Tests
needs: [clone,build_sm8090,ci_image_build]
uses: ./.github/workflows/_stable_test.yml
with:
DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
publish_pre_check:
name: Publish Docker Images Pre Check
needs: [ci_image_build, unittest_coverage,logprob_test,pre_ce_test,base_test,accuracy_test,stable_test]
runs-on: [self-hosted, Docker-Build]
steps:
- name: Images Uploading
env:
images_name: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
ci_image_name: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate"
run: |
echo "images_name=${images_name}"
docker images ${ci_image_name}
docker tag ${images_name} ${ci_image_name}
docker push ${ci_image_name}

View File

@@ -24,7 +24,7 @@ jobs:
- name: Code Checkout
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
run: |
REPO="https://github.com/${{ github.repository }}.git"
FULL_REPO="${{ github.repository }}"
@@ -55,7 +55,7 @@ jobs:
- name: Run CI unittest
env:
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
run: |
runner_name="${{ runner.name }}"
last_char="${runner_name: -1}"
@@ -77,6 +77,7 @@ jobs:
-e "MODEL_PATH=/ssd3/model" \
-e "http_proxy=$(git config --global --get http.proxy)" \
-e "https_proxy=$(git config --global --get https.proxy)" \
-e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \
-e "FD_API_PORT=${FD_API_PORT}" \
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \

View File

@@ -15,7 +15,7 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: 3.x
- run: pip install mkdocs-material mkdocs-get-deps mkdocs-material-extensions mkdocs-multilang
- run: pip install mkdocs-material mkdocs-get-deps mkdocs-material-extensions mkdocs-multilang mkdocs-static-i18n
- name: Deploy to GitHub Pages
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -19,7 +19,7 @@ jobs:
needs: clone
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "90"
WITH_NIGHTLY_BUILD: "OFF"
@@ -39,16 +39,59 @@ jobs:
needs: [clone,build]
uses: ./.github/workflows/_unit_test_coverage.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
logprob_test:
name: Run FastDeploy LogProb Tests
needs: [build]
uses: ./.github/workflows/_logprob_test_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache"
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
pre_ce_test:
name: Extracted partial CE model tasks to run in CI.
needs: [clone,build]
uses: ./.github/workflows/_pre_ce_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
base_test:
name: Run Base Tests
needs: [clone,build]
uses: ./.github/workflows/_base_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
accuracy_test:
name: Run Accuracy Tests
needs: [clone,build]
uses: ./.github/workflows/_accuracy_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
stable_test:
name: Run Stable Tests
needs: [clone,build]
uses: ./.github/workflows/_stable_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

331
.github/workflows/publish_job.yml vendored Normal file
View File

@@ -0,0 +1,331 @@
name: Publish Job
on:
workflow_dispatch:
schedule:
- cron: '0 18 * * *' # 2:00 AM China Standard Time (UTC+8)
push:
# branches:
# - develop
tags:
- '*'
permissions: read-all
concurrency:
group: Publish-Job-${{ github.ref }}-${{ github.sha }}
cancel-in-progress: true
jobs:
publish_pre_check:
runs-on: ubuntu-latest
if: |
github.event.repository.fork == false &&
(
(github.event_name == 'schedule' && github.ref_name == 'develop') ||
(github.event_name == 'push' && github.ref_type == 'tag') ||
((github.event_name == 'workflow_dispatch') &&
(github.ref_name == 'develop' || github.ref_type == 'tag'))
)
env:
TAG_VERSION_MAPPINGS: ${{ vars.TAG_VERSION_MAPPINGS }}
FD_VERSION_DEV: ${{ vars.FD_VERSION_DEV }}
COMPILE_USE_PADDLE_WHL_URL_MAPPINGS: ${{ vars.COMPILE_USE_PADDLE_WHL_URL_MAPPINGS }}
outputs:
compile_use_paddle_version: ${{ steps.set_output.outputs.compile_use_paddle_version }}
compile_continue: ${{ steps.set_output.outputs.compile_continue }}
fd_version: ${{ steps.set_output.outputs.fd_version }}
with_nightly_build: ${{ steps.set_output.outputs.with_nightly_build }}
compile_use_paddle_whl_url: ${{ steps.set_output.outputs.compile_use_paddle_whl_url }}
steps:
- name: Get tag version
if: github.ref_type == 'tag'
run: |
TAG_NAME="${GITHUB_REF##*/}" # 提取 tag 名称,比如 v2.1.0
TAG_VERSION="${TAG_NAME#v}" # 去掉前缀 v
echo "FD_VERSION=$TAG_VERSION" >> $GITHUB_ENV
- name: Check FD version to Paddle version mapping
if: github.ref_type == 'tag'
env:
TARGET_FD: ${{ env.FD_VERSION }}
run: |
FOUND_PADDLE=""
# 遍历映射
for pair in $(echo $TAG_VERSION_MAPPINGS | tr ';' ' '); do
fd=$(echo "$pair" | cut -d',' -f1)
paddle=$(echo "$pair" | cut -d',' -f2)
if [[ "$fd" == "$TARGET_FD" ]]; then
FOUND_PADDLE="$paddle"
break
fi
done
if [[ -z "$FOUND_PADDLE" ]]; then
echo "No Paddle version found for FD $TARGET_FD"
else
echo "FD $TARGET_FD maps to Paddle $FOUND_PADDLE"
echo "PADDLE_VERSION=$FOUND_PADDLE" >> $GITHUB_ENV
fi
- name: Set Version
id: set_output
env:
PADDLE_VERSION: ${{ env.PADDLE_VERSION }}
FD_VERSION: ${{ env.FD_VERSION }}
run: |
if [[ "${{ github.ref_type }}" == "tag" ]]; then
if [[ -z "$PADDLE_VERSION" ]]; then
compile_continue=false
else
compile_use_paddle_version=$PADDLE_VERSION
compile_continue=true
fi
fd_version=$FD_VERSION
fi
if [[ "${{ github.ref_name }}" == "develop" ]];then
compile_continue=true
compile_use_paddle_version=""
fd_version=${FD_VERSION_DEV}
with_nightly_build=ON
fi
# Todo
# 通过变量COMPILE_USE_PADDLE_WHL_URL_MAPPINGS中的映射关系,决定是否是安装指定版本的Paddle还是直接安装URL
for pair in $(echo $COMPILE_USE_PADDLE_WHL_URL_MAPPINGS | tr ';' ' '); do
branch=$(echo "$pair" | cut -d',' -f1)
paddle_whl_url=$(echo "$pair" | cut -d',' -f2)
if [[ "$branch" == "${{ github.ref_name }}" ]]; then
FOUND_PADDLE_URL="$paddle_whl_url"
echo "compile_use_paddle_whl_url=${FOUND_PADDLE_URL}" >> $GITHUB_OUTPUT
compile_continue=true
break
fi
done
echo "compile_continue=${compile_continue}" >> $GITHUB_OUTPUT
echo "compile_use_paddle_version=${compile_use_paddle_version}" >> $GITHUB_OUTPUT
echo "fd_version=${fd_version}" >> $GITHUB_OUTPUT
echo "with_nightly_build=${with_nightly_build:-OFF}" >> $GITHUB_OUTPUT
print_publish_pre_check_outputs:
runs-on: ubuntu-latest
needs: publish_pre_check
steps:
- name: Print outputs as JSON
run: |
echo '${{ toJSON(needs.publish_pre_check.outputs) }}'
clone:
environment: CodeSync
name: FD-Clone-Linux
runs-on: ubuntu-latest
needs: publish_pre_check
if: ${{ needs.publish_pre_check.outputs.compile_continue == 'true' }}
outputs:
repo_archive_url: ${{ steps.set_output.outputs.repo_archive_url }}
steps:
- name: Clone FastDeploy
uses: actions/checkout@v4
with:
ref: ${{ github.ref_name }}
submodules: 'recursive'
fetch-depth: 1000
- name: Python Setup
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Code Info Show and Upload
id: set_output
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
run: |
git config --unset http.https://github.com/.extraheader
git submodule foreach --recursive sh -c "git config --local --unset-all 'http.https://github.com/.extraheader'"
git submodule foreach --recursive sh -c "git config remote.origin.fetch '+refs/heads/*:refs/remotes/origin/*'"
echo "Current HEAD Log:"
git log --oneline -n 5
ls
cd ..
tar -zcf FastDeploy.tar.gz FastDeploy
if [[ "${{ github.ref_type }}" == "tag" ]]; then
commit_id=${{ github.sha }}
tag_name=${{ github.ref_name }}
target_path=paddle-qa/TAG/FastDeploy/${tag_name}/${commit_id}
else
commit_id=${{ github.sha }}
branch_name=${{ github.ref_name }}
target_path=paddle-qa/BRANCH/FastDeploy/${branch_name}/${commit_id}
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} FastDeploy.tar.gz ${target_path}
target_path_stripped="${target_path#paddle-qa/}"
REPO_ARCHIVE_URL=https://paddle-qa.bj.bcebos.com/${target_path_stripped}/FastDeploy.tar.gz
echo "repo_archive_url=${REPO_ARCHIVE_URL}" >> $GITHUB_OUTPUT
resultshow:
name: Show Code Archive Output
needs: clone
runs-on: ubuntu-latest
steps:
- name: Print wheel path
run: |
echo "The code archive is located at: ${{ needs.clone.outputs.repo_archive_url }}"
build_sm8090:
name: BUILD_SM8090
needs: [clone, publish_pre_check]
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "80,90"
WITH_NIGHTLY_BUILD: ${{ needs.publish_pre_check.outputs.with_nightly_build }}
FD_VERSION: ${{ needs.publish_pre_check.outputs.fd_version }}
PADDLEVERSION: ${{ needs.publish_pre_check.outputs.compile_use_paddle_version }}
PADDLE_WHL_URL: ${{ needs.publish_pre_check.outputs.compile_use_paddle_whl_url }}
build_sm8689:
name: BUILD_SM8689
needs: [clone, publish_pre_check]
uses: ./.github/workflows/_build_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
COMPILE_ARCH: "86,89"
WITH_NIGHTLY_BUILD: ${{ needs.publish_pre_check.outputs.with_nightly_build }}
FD_VERSION: ${{ needs.publish_pre_check.outputs.fd_version }}
PADDLEVERSION: ${{ needs.publish_pre_check.outputs.compile_use_paddle_version }}
PADDLE_WHL_URL: ${{ needs.publish_pre_check.outputs.compile_use_paddle_whl_url }}
paddle_pypi_upload_sm8090:
environment: PaddleSourceUpload
name: PADDLE_PYPI_UPLOAD_8090
needs: build_sm8090
runs-on: ubuntu-latest
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
COMPILE_ARCH: "80,90"
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Wheel Info Show and Upload
if: github.ref_name == 'develop' || github.ref_type == 'tag'
run: |
echo "The wheel is located at: ${FASTDEPLOY_WHEEL_URL}"
wget -q --no-check-certificate ${FASTDEPLOY_WHEEL_URL}
filename=$(basename ${FASTDEPLOY_WHEEL_URL})
if [[ "${{ github.ref_name }}" == "develop" ]];then
target_path=paddle-whl/nightly/fastdeploy-gpu-${COMPILE_ARCH//,/_}/fastdeploy-gpu
elif [[ "${{ github.ref_type }}" == "tag" ]]; then
target_path=paddle-whl/stable/fastdeploy-gpu-${COMPILE_ARCH//,/_}/fastdeploy-gpu
else
echo "Not develop or tag, do nothing"
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} ${filename} ${target_path}
paddle_pypi_upload_sm8689:
environment: PaddleSourceUpload
name: PADDLE_PYPI_UPLOAD_8689
needs: build_sm8689
runs-on: ubuntu-latest
env:
AK: ${{ secrets.BOS_AK }}
SK: ${{ secrets.BOS_SK }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8689.outputs.wheel_path }}
COMPILE_ARCH: "86,89"
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Wheel Info Show and Upload
if: github.ref_name == 'develop' || github.ref_type == 'tag'
run: |
echo "The wheel is located at: ${FASTDEPLOY_WHEEL_URL}"
wget -q --no-check-certificate ${FASTDEPLOY_WHEEL_URL}
filename=$(basename ${FASTDEPLOY_WHEEL_URL})
if [[ "${{ github.ref_name }}" == "develop" ]];then
target_path=paddle-whl/nightly/fastdeploy-gpu-${COMPILE_ARCH//,/_}/fastdeploy-gpu
elif [[ "${{ github.ref_type }}" == "tag" ]]; then
target_path=paddle-whl/stable/fastdeploy-gpu-${COMPILE_ARCH//,/_}/fastdeploy-gpu
else
echo "Not develop or tag, do nothing"
fi
wget -q --no-proxy --no-check-certificate https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddlePaddle/PaddleTest/tools/bos_tools.py
push_file=$(realpath bos_tools.py)
python -m pip install bce-python-sdk==0.9.29
ls
python ${push_file} ${filename} ${target_path}
unittest_coverage:
name: Run FastDeploy Unit Tests and Coverage
needs: [clone,build_sm8090]
uses: ./.github/workflows/_unit_test_coverage.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
logprob_test:
name: Run FastDeploy LogProb Tests
needs: [build_sm8090]
uses: ./.github/workflows/_logprob_test_linux.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz"
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
pre_ce_test:
name: Extracted partial CE model tasks to run in CI.
needs: [clone,build_sm8090]
uses: ./.github/workflows/_pre_ce_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
base_test:
name: Run Base Tests
needs: [clone,build_sm8090]
uses: ./.github/workflows/_base_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
accuracy_test:
name: Run Accuracy Tests
needs: [clone,build_sm8090]
uses: ./.github/workflows/_accuracy_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
stable_test:
name: Run Stable Tests
needs: [clone,build_sm8090]
uses: ./.github/workflows/_stable_test.yml
with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"

14
.gitignore vendored
View File

@@ -121,7 +121,7 @@ dmypy.json
FETCH_HEAD
#log
log*/
log/
checkpoints/
checkpoints_origin/
@@ -156,6 +156,12 @@ nohup.out
custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cutlass
custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute
#marlin_kernel
custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_*.cu
#machete_kernel
custom_ops/gpu_ops/machete/generated
# buff
custom_ops/tmp*
@@ -164,3 +170,9 @@ build
.ccls-cache
third_party
custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_*.cu
custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_template.h
custom_ops/gpu_ops/wfp8afp8_sparse_gemm/wfp8Afp8_sparse_gemm_*.cu
custom_ops/gpu_ops/wfp8afp8_sparse_gemm/wfp8Afp8_sparse_gemm_template.h

10
.gitmodules vendored Normal file
View File

@@ -0,0 +1,10 @@
[submodule "custom_ops/third_party/DeepGEMM"]
path = custom_ops/third_party/DeepGEMM
url = https://github.com/deepseek-ai/DeepGEMM.git
ignore = all
[submodule "custom_ops/third_party/cutlass"]
path = custom_ops/third_party/cutlass
url = https://github.com/NVIDIA/cutlass.git
[submodule "custom_ops/third_party/nlohmann_json"]
path = custom_ops/third_party/nlohmann_json
url = https://github.com/nlohmann/json.git

View File

@@ -1,3 +1,4 @@
English | [简体中文](README_CN.md)
<p align="center">
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
@@ -22,11 +23,12 @@
</p>
--------------------------------------------------------------------------------
# FastDeploy 2.0: Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
# FastDeploy : Inference and Deployment Toolkit for LLMs and VLMs based on PaddlePaddle
## News
**[2025-09] 🔥 FastDeploy v2.2 is newly released!** It now offers compatibility with models in the HuggingFace ecosystem, has further optimized performance, and newly adds support for [baidu/ERNIE-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking)!
**[2025-07] 《FastDeploy2.0推理部署实测》专题活动已上线!** 完成文心4.5系列开源模型的推理部署等任务即可获得骨瓷马克杯等FastDeploy2.0官方周边及丰富奖金!🎁 欢迎大家体验反馈~ 📌[报名地址](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[活动详情](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
**[2025-08] 🔥 Released FastDeploy v2.1:** A brand-new KV Cache scheduling strategy has been introduced, and expanded support for PD separation and CUDA Graph across more models. Enhanced hardware support has been added for platforms like Kunlun and Hygon, along with comprehensive optimizations to improve the performance of both the service and inference engine.
**[2025-07] The FastDeploy 2.0 Inference Deployment Challenge is now live!** Complete the inference deployment task for the ERNIE 4.5 series open-source models to win official FastDeploy 2.0 merch and generous prizes! 🎁 You're welcome to try it out and share your feedback! 📌[Sign up here](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[Event details](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
@@ -41,7 +43,7 @@
- 🤝 **OpenAI API Server and vLLM Compatible**: One-command deployment with [vLLM](https://github.com/vllm-project/vllm/) interface compatibility.
- 🧮 **Comprehensive Quantization Format Support**: W8A16, W8A8, W4A16, W4A8, W2A16, FP8, and more.
-**Advanced Acceleration Techniques**: Speculative decoding, Multi-Token Prediction (MTP) and Chunked Prefill.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU etc.
- 🖥️ **Multi-Hardware Support**: NVIDIA GPU, Kunlunxin XPU, Hygon DCU, Ascend NPU, Iluvatar GPU, Enflame GCU, MetaX GPU, Intel Gaudi etc.
## Requirements
@@ -50,14 +52,17 @@
## Installation
FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, and other hardware. For detailed installation instructions:
FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**, **Iluvatar GPUs**, **Enflame GCUs**, **Hygon DCUs** and other hardware. For detailed installation instructions:
- [NVIDIA GPU](./docs/get_started/installation/nvidia_gpu.md)
- [Kunlunxin XPU](./docs/get_started/installation/kunlunxin_xpu.md)
- [Iluvatar GPU](./docs/get_started/installation/iluvatar_gpu.md)
- [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
- [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)
- [MetaX GPU](./docs/get_started/installation/metax_gpu.md)
- [Intel Gaudi](./docs/get_started/installation/intel_gaudi.md)
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU, Hygon DCU, and MetaX GPU are currently under development and testing. Stay tuned for updates!
**Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!
## Get Started
@@ -67,19 +72,12 @@ Learn how to use FastDeploy through our documentation:
- [ERNIE-4.5-VL Multimodal Model Deployment](./docs/get_started/ernie-4.5-vl.md)
- [Offline Inference Development](./docs/offline_inference.md)
- [Online Service Deployment](./docs/online_serving/README.md)
- [Full Supported Models List](./docs/supported_models.md)
- [Best Practices](./docs/best_practices/README.md)
## Supported Models
| Model | Data Type | PD Disaggregation | Chunked Prefill | Prefix Caching | MTP | CUDA Graph | Maximum Context Length |
|:--- | :------- | :---------- | :-------- | :-------- | :----- | :----- | :----- |
|ERNIE-4.5-300B-A47B | BF16/WINT4/WINT8/W4A8C8/WINT2/FP8 | ✅| ✅ | ✅|✅(WINT4)| WIP |128K |
|ERNIE-4.5-300B-A47B-Base| BF16/WINT4/WINT8 | ✅| ✅ | ✅|✅(WINT4)| WIP | 128K |
|ERNIE-4.5-VL-424B-A47B | BF16/WINT4/WINT8 | WIP | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-VL-28B-A3B | BF16/WINT4/WINT8 | ❌ | ✅ | WIP | ❌ | WIP |128K |
|ERNIE-4.5-21B-A3B | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-21B-A3B-Base | BF16/WINT4/WINT8/FP8 | ❌ | ✅ | ✅ | WIP | ✅|128K |
|ERNIE-4.5-0.3B | BF16/WINT8/FP8 | ❌ | ✅ | ✅ | ❌ | ✅| 128K |
Learn how to download models, enable using the torch format, and more:
- [Full Supported Models List](./docs/supported_models.md)
## Advanced Usage

90
README_CN.md Normal file
View File

@@ -0,0 +1,90 @@
[English](README.md) | 简体中文
<p align="center">
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://github.com/user-attachments/assets/42b0039f-39e3-4279-afda-6d1865dfbffb" width="500"></a>
</p>
<p align="center">
<a href=""><img src="https://img.shields.io/badge/python-3.10-aff.svg"></a>
<a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/FastDeploy?color=9cc"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/4046" target="_blank"><img src="https://trendshift.io/api/badge/repositories/4046" alt="PaddlePaddle%2FFastDeploy | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a></br>
<a href="https://paddlepaddle.github.io/FastDeploy/zh/get_started/installation/nvidia_gpu/"><b> 安装指导 </b></a>
|
<a href="https://paddlepaddle.github.io/FastDeploy/zh/get_started/quick_start"><b> 快速入门 </b></a>
|
<a href="https://paddlepaddle.github.io/FastDeploy/zh/supported_models/"><b> 支持模型列表 </b></a>
</p>
--------------------------------------------------------------------------------
# FastDeploy :基于飞桨的大语言模型与视觉语言模型推理部署工具包
## 最新活动
**[2025-09] 🔥 FastDeploy v2.2 全新发布**: HuggingFace生态模型兼容性能进一步优化更新增对[baidu/ERNIE-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking)支持!
**[2025-08] FastDeploy v2.1 发布**:全新的KV Cache调度策略更多模型支持PD分离和CUDA Graph昆仑、海光等更多硬件支持增强全方面优化服务和推理引擎的性能。
**[2025-07] 《FastDeploy2.0推理部署实测》专题活动已上线!** 完成文心4.5系列开源模型的推理部署等任务即可获得骨瓷马克杯等FastDeploy2.0官方周边及丰富奖金!🎁 欢迎大家体验反馈~ 📌[报名地址](https://www.wjx.top/vm/meSsp3L.aspx#) 📌[活动详情](https://github.com/PaddlePaddle/FastDeploy/discussions/2728)
## 关于
**FastDeploy** 是基于飞桨PaddlePaddle的大语言模型LLM与视觉语言模型VLM推理部署工具包提供**开箱即用的生产级部署方案**,核心技术特性包括:
- 🚀 **负载均衡式PD分解**工业级解决方案支持上下文缓存与动态实例角色切换在保障SLO达标和吞吐量的同时优化资源利用率
- 🔄 **统一KV缓存传输**轻量级高性能传输库支持智能NVLink/RDMA选择
- 🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
- 🧮 **全量化格式支持**W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
-**高级加速技术**推测解码、多令牌预测MTP及分块预填充
- 🖥️ **多硬件支持**NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU、英特尔Gaudi等
## 要求
- 操作系统: Linux
- Python: 3.10 ~ 3.12
## 安装
FastDeploy 支持在**英伟达NVIDIAGPU**、**昆仑芯KunlunxinXPU**、**天数IluvatarGPU**、**燧原EnflameGCU**、**海光HygonDCU** 以及其他硬件上进行推理部署。详细安装说明如下:
- [英伟达 GPU](./docs/zh/get_started/installation/nvidia_gpu.md)
- [昆仑芯 XPU](./docs/zh/get_started/installation/kunlunxin_xpu.md)
- [天数 CoreX](./docs/zh/get_started/installation/iluvatar_gpu.md)
- [燧原 S60](./docs/zh/get_started/installation/Enflame_gcu.md)
- [海光 DCU](./docs/zh/get_started/installation/hygon_dcu.md)
- [沐曦 GPU](./docs/zh/get_started/installation/metax_gpu.md)
- [英特尔 Gaudi](./docs/zh/get_started/installation/intel_gaudi.md)
**注意:** 我们正在积极拓展硬件支持范围。目前包括昇腾AscendNPU 等其他硬件平台正在开发测试中。敬请关注更新!
## 入门指南
通过我们的文档了解如何使用 FastDeploy
- [10分钟快速部署](./docs/zh/get_started/quick_start.md)
- [ERNIE-4.5 部署](./docs/zh/get_started/ernie-4.5.md)
- [ERNIE-4.5-VL 部署](./docs/zh/get_started/ernie-4.5-vl.md)
- [离线推理](./docs/zh/offline_inference.md)
- [在线服务](./docs/zh/online_serving/README.md)
- [最佳实践](./docs/zh/best_practices/README.md)
## 支持模型列表
通过我们的文档了解如何下载模型如何支持torch格式等
- [模型支持列表](./docs/zh/supported_models.md)
## 进阶用法
- [量化](./docs/zh/quantization/README.md)
- [分离式部署](./docs/zh/features/disaggregated.md)
- [投机解码](./docs/zh/features/speculative_decoding.md)
- [前缀缓存](./docs/zh/features/prefix_caching.md)
- [分块预填充](./docs/zh/features/chunked_prefill.md)
## 致谢
FastDeploy 依据 [Apache-2.0 开源许可证](./LICENSE). 进行授权。在开发过程中,我们参考并借鉴了 [vLLM](https://github.com/vllm-project/vllm) 的部分代码,以保持接口兼容性,在此表示衷心感谢。

View File

@@ -98,7 +98,7 @@ def main(args):
raise ValueError("--max_concurrency should be same length as --s_itl_base_model")
for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
# Wramup
# Warmup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):

View File

@@ -965,7 +965,7 @@ if __name__ == "__main__":
parser.add_argument(
"--backend",
type=str,
default="vllm",
default="openai-chat",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
parser.add_argument(

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 128
tensor_parallel_size: 4
use_cudagraph: True
load_choices: "default_v1"

View File

@@ -0,0 +1,6 @@
max_model_len: 32768
max_num_seqs: 128
tensor_parallel_size: 4
use_cudagraph: True
load_choices: "default_v1"
quantization: wfp8afp8

View File

@@ -6,3 +6,4 @@ tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3
quantization: wint4

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 1
max_model_len: 131072
max_num_seqs: 32
quantization: wint4
max_num_batched_tokens: 8192
plas_attention_config: '{"plas_encoder_top_k_left": 50, "plas_encoder_top_k_right": 60, "plas_decoder_top_k_left": 100, "plas_decoder_top_k_right": 120}'

View File

@@ -6,3 +6,4 @@ tensor_parallel_size: 8
max_num_batched_tokens: 4096
max_num_partial_prefills: 3
max_long_partial_prefills: 3
quantization: wint8

View File

@@ -0,0 +1,5 @@
max_model_len: 32768
max_num_seqs: 256
kv_cache_ratio: 0.75
tensor_parallel_size: 4
gpu_memory_utilization: 0.9

View File

@@ -13,3 +13,4 @@ pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3
quantization: wint4

View File

@@ -10,3 +10,4 @@ engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
quantization: wint4

View File

@@ -0,0 +1,6 @@
num_gpu_blocks_override: 1024
max_model_len: 8192
max_num_seqs: 64
data_parallel_size: 8
tensor_parallel_size: 1
enable_expert_parallel: True

View File

@@ -0,0 +1,11 @@
enable_mm: True
max_model_len: 131072
max_num_seqs: 56
gpu_memory_utilization: 0.8
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint4
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -1,7 +1,7 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.95
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8

View File

@@ -1,7 +1,7 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 36
gpu_memory_utilization: 0.8
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.8
tensor_parallel_size: 8
quantization: wint8

View File

@@ -0,0 +1,9 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,10 @@
enable_mm: True
max_model_len: 32768
max_num_seqs: 128
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.71
tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint8
reasoning_parser: ernie-45-vl

View File

@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1 @@
max_tokens: 131071

View File

@@ -0,0 +1 @@
max_tokens: 12288

View File

@@ -0,0 +1,8 @@
top_p: 0.95
temperature: 0.6
metadata:
min_tokens: 1
max_tokens: 131071
repetition_penalty: 1.0
frequency_penalty: 0
presence_penalty: 0

View File

@@ -0,0 +1,10 @@
reasoning-parser: ernie_x1
tool_call_parser: ernie_x1
tensor_parallel_size: 4
max_model_len: 65536
max_num_seqs: 128
enable_prefix_caching: True
enable_chunked_prefill: True
gpu_memory_utilization: 0.85
use_cudagraph: True
enable_custom_all_reduce: True

View File

@@ -0,0 +1,6 @@
tensor_parallel_size: 1
max_model_len: 131072
max_num_seqs: 32
reasoning_parser: ernie_x1
tool_call_parser: ernie_x1
load_choices: "default_v1"

View File

@@ -34,7 +34,6 @@ EGG_DIR="fastdeploy.egg-info"
# custom_ops directory config
OPS_SRC_DIR="custom_ops"
OPS_TMP_DIR_BASE="tmp_base"
OPS_TMP_DIR="tmp"
# command line log config
@@ -71,25 +70,20 @@ function copy_ops(){
PY_VERSION="py${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
SYSTEM_VERSION=`${python} -c "import platform; print(platform.system().lower())"`
PROCESSOR_VERSION=`${python} -c "import platform; print(platform.processor())"`
WHEEL_BASE_NAME="fastdeploy_base_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
WHEEL_NAME="fastdeploy_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
if [ "$is_rocm" = "True" ]; then
DEVICE_TYPE="rocm"
mkdir -p ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "BASE and ROCM ops have been copy to fastdeploy"
echo -e "ROCM ops have been copy to fastdeploy"
return
fi
mkdir -p ../fastdeploy/model_executor/ops/base
is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
if [ "$is_cuda" = "True" ]; then
DEVICE_TYPE="gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "BASE and CUDA ops have been copy to fastdeploy"
echo -e "CUDA ops have been copy to fastdeploy"
return
fi
@@ -112,9 +106,8 @@ function copy_ops(){
if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
if [ "$if_corex" = "True" ]; then
DEVICE_TYPE="iluvatar-gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
echo -e "Iluvatar ops have been copy to fastdeploy"
return
fi
@@ -126,27 +119,39 @@ function copy_ops(){
return
fi
is_maca=`$python -c "import paddle; print(paddle.device.is_compiled_with_custom_device('metax_gpu'))"`
if [ "$is_maca" = "True" ]; then
DEVICE_TYPE="metax_gpu"
mkdir -p ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
echo -e "MACA ops have been copy to fastdeploy"
return
fi
is_intel_hpu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('intel_hpu'))"`
if [ "$is_intel_hpu" = "True" ]; then
DEVICE_TYPE="intel-hpu"
echo -e "intel_hpu ops have been copy to fastdeploy"
return
fi
DEVICE_TYPE="cpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cd ../../../../
cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
echo -e "BASE and CPU ops have been copy to fastdeploy"
echo -e "CPU ops have been copy to fastdeploy"
return
}
function build_and_install_ops() {
cd $OPS_SRC_DIR
export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
if [ "$is_xpu" = "True" ]; then
cd xpu_ops/src
cd xpu_ops
bash build.sh ${TMP_DIR_REAL_PATH}
cd ../..
cd ..
elif [ "$FD_CPU_USE_BF16" == "true" ]; then
if [ "$FD_BUILDING_ARCS" == "" ]; then
FD_CPU_USE_BF16=True ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
@@ -160,7 +165,9 @@ function build_and_install_ops() {
else
FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
fi
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
if [ -d "${OPS_TMP_DIR}" ]; then
find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
fi
else
echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
exit 1
@@ -213,7 +220,6 @@ function cleanup() {
fi
rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
}

View File

@@ -84,7 +84,6 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
seq_length,
bsz);
return {x_remove_padding,
cum_offsets_out,
padding_offset,
cu_seqlens_q,
cu_seqlens_k};
@@ -97,7 +96,7 @@ std::vector<std::vector<int64_t>> GetPaddingOffsetInferShape(
const std::vector<int64_t> &seq_len_shape) {
int64_t bsz = seq_len_shape[0];
int64_t seq_len = input_ids_shape[1];
return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}};
return {{-1}, {-1}, {bsz + 1}, {bsz + 1}};
}
std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
@@ -106,7 +105,6 @@ std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
const paddle::DataType &token_num_dtype,
const paddle::DataType &seq_len_dtype) {
return {input_ids_dtype,
seq_len_dtype,
seq_len_dtype,
seq_len_dtype,
seq_len_dtype};
@@ -115,7 +113,6 @@ std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
PD_BUILD_STATIC_OP(get_padding_offset_cpu)
.Inputs({"input_ids", "cum_offsets", "token_num", "seq_len"})
.Outputs({"x_remove_padding",
"cum_offsets_out",
"padding_offset",
"cu_seqlens_q",
"cu_seqlens_k"})

View File

@@ -1,4 +1,4 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -19,10 +19,11 @@
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif
template <typename T>
void RebuildPaddingCPUImpl(T *output_data,
const T *input_data,
const int *cum_offsets_data,
const int *cu_seqlens_q_data,
const int *seq_len_this_time_data,
const int *seq_lens_decoder_data,
const int *seq_lens_encoder_data,
@@ -40,11 +41,12 @@ void RebuildPaddingCPUImpl(T *output_data,
if (seq_lens_decoder_data[bi] == 0 && seq_lens_encoder_data[bi] == 0) {
continue;
}
if (seq_lens_encoder_data[bi] > 0) {
seq_id = seq_lens_encoder_data[bi] - 1;
}
const int ori_token_idx =
bi * max_input_length - cum_offsets_data[bi] + seq_id;
const int ori_token_idx = cu_seqlens_q_data[bi] + seq_id;
const int src_offset = ori_token_idx * dim_embed + bias_idx;
output_data[i] = input_data[src_offset];
@@ -54,7 +56,7 @@ void RebuildPaddingCPUImpl(T *output_data,
template <typename T>
void RebuildAppendPaddingCPUImpl(T *output_data,
const T *input_data,
const int *cum_offsets_data,
const int *cu_seqlens_q_data,
const int *seq_len_this_time_data,
const int *seq_lens_decoder_data,
const int *seq_lens_encoder_data,
@@ -69,30 +71,32 @@ void RebuildAppendPaddingCPUImpl(T *output_data,
int bi = ori_token_id / max_input_length;
if (seq_len_this_time_data[bi] == 0 ||
(seq_lens_decoder_data[bi] == 0 &&
seq_lens_encoder_data[bi] == 0)) {
continue;
}
seq_lens_encoder_data[bi] == 0)) {
continue;
}
int seq_id = 0;
if (seq_lens_encoder_data[bi] > 0) {
seq_id = seq_lens_encoder_data[bi] - 1;
}
int input_token_id = ori_token_id - cum_offsets_data[bi] + seq_id;
int input_token_id = cu_seqlens_q_data[bi] + seq_id;
int bias_idx = i % dim_embed;
int src_offset = input_token_id * dim_embed + bias_idx;
output_data[i] = input_data[src_offset];
}
}
std::vector<paddle::Tensor> RebuildPaddingCPU(
const paddle::Tensor &tmp_out,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &seq_len_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_encoder,
const paddle::optional<paddle::Tensor> &output_padding_offset,
int max_input_length) {
auto tmp_out_cpu = tmp_out.copy_to(paddle::CPUPlace(), true);
auto cum_offsets_cpu = cum_offsets.copy_to(paddle::CPUPlace(), true);
auto cu_seqlens_q_cpu = cu_seqlens_q.copy_to(paddle::CPUPlace(), true);
auto seq_len_this_time_cpu =
seq_len_this_time.copy_to(paddle::CPUPlace(), true);
auto seq_lens_decoder_cpu =
@@ -107,7 +111,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
int token_num = tmp_out_cpu.shape()[0];
int dim_embed = tmp_out_cpu.shape()[1];
int bsz = cum_offsets_cpu.shape()[0];
int bsz = cu_seqlens_q_cpu.shape()[0] - 1;
paddle::Tensor out;
if (output_padding_offset_cpu) {
@@ -128,7 +132,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
{bsz, dim_embed}, 0, tmp_out_cpu.dtype(), paddle::CPUPlace());
}
const int *cum_offsets_data = cum_offsets_cpu.data<int>();
const int *cu_seqlens_q_data = cu_seqlens_q_cpu.data<int>();
const int *seq_len_this_time_data = seq_len_this_time_cpu.data<int>();
const int *seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
const int *seq_lens_encoder_data = seq_lens_encoder_cpu.data<int>();
@@ -141,7 +145,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
case paddle::DataType::FLOAT32:
RebuildAppendPaddingCPUImpl<float>(out.data<float>(),
tmp_out_cpu.data<float>(),
cum_offsets_data,
cu_seqlens_q_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
@@ -154,7 +158,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
RebuildAppendPaddingCPUImpl<paddle::float16>(
out.data<paddle::float16>(),
tmp_out_cpu.data<paddle::float16>(),
cum_offsets_data,
cu_seqlens_q_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
@@ -167,7 +171,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
RebuildAppendPaddingCPUImpl<paddle::bfloat16>(
out.data<paddle::bfloat16>(),
tmp_out_cpu.data<paddle::bfloat16>(),
cum_offsets_data,
cu_seqlens_q_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
@@ -186,7 +190,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
case paddle::DataType::FLOAT32:
RebuildPaddingCPUImpl<float>(out.data<float>(),
tmp_out_cpu.data<float>(),
cum_offsets_data,
cu_seqlens_q_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
@@ -198,7 +202,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
RebuildPaddingCPUImpl<paddle::float16>(
out.data<paddle::float16>(),
tmp_out_cpu.data<paddle::float16>(),
cum_offsets_data,
cu_seqlens_q_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
@@ -207,11 +211,10 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
elem_nums);
break;
case paddle::DataType::BFLOAT16:
RebuildPaddingCPUImpl<paddle::bfloat16>(
out.data<paddle::bfloat16>(),
tmp_out_cpu.data<paddle::bfloat16>(),
cum_offsets_data,
cu_seqlens_q_data,
seq_len_this_time_data,
seq_lens_decoder_data,
seq_lens_encoder_data,
@@ -230,7 +233,7 @@ std::vector<paddle::Tensor> RebuildPaddingCPU(
std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
const std::vector<int64_t> &tmp_out_shape,
const std::vector<int64_t> &cum_offsets_shape,
const std::vector<int64_t> &cu_seqlens_q_shape,
const std::vector<int64_t> &seq_len_this_time_shape,
const std::vector<int64_t> &seq_lens_decoder_shape,
const std::vector<int64_t> &seq_lens_encoder_shape,
@@ -239,14 +242,14 @@ std::vector<std::vector<int64_t>> RebuildPaddingInferShape(
if (output_padding_offset_shape) {
return {{-1, dim_embed}};
} else {
int64_t bsz = cum_offsets_shape[0];
int64_t bsz = cu_seqlens_q_shape[0] - 1;
return {{bsz, dim_embed}};
}
}
std::vector<paddle::DataType> RebuildPaddingInferDtype(
const paddle::DataType &tmp_out_dtype,
const paddle::DataType &cum_offsets_dtype,
const paddle::DataType &cu_seqlens_q_dtype,
const paddle::DataType &seq_len_this_time_dtype,
const paddle::DataType &seq_lens_decoder_dtype,
const paddle::DataType &seq_lens_encoder_dtype,
@@ -256,7 +259,7 @@ std::vector<paddle::DataType> RebuildPaddingInferDtype(
PD_BUILD_STATIC_OP(rebuild_padding_cpu)
.Inputs({"tmp_out",
"cum_offsets",
"cu_seqlens_q",
"seq_len_this_time",
"seq_lens_decoder",
"seq_lens_encoder",

View File

@@ -14,7 +14,7 @@
#include "paddle/extension.h"
void set_value_by_flag_and_id(const bool *stop_flags,
void set_value_by_flags_and_idx(const bool *stop_flags,
int64_t *pre_ids_all,
const int64_t *input_ids,
const int *seq_lens_encoder,
@@ -50,7 +50,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
int length = pre_ids_all_shape[1];
int length_input_ids = input_ids.shape()[1];
set_value_by_flag_and_id(stop_flags.data<bool>(),
set_value_by_flags_and_idx(stop_flags.data<bool>(),
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
input_ids.data<int64_t>(),
seq_lens_encoder.data<int>(),

View File

@@ -46,7 +46,7 @@ void update_inputs_kernel(bool *not_need_stop,
not_need_stop[0] = stop_sum < stop_nums[0];
}
void UpdateInputes(const paddle::Tensor &stop_flags,
void UpdateInputs(const paddle::Tensor &stop_flags,
const paddle::Tensor &not_need_stop,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_encoder,
@@ -90,4 +90,4 @@ PD_BUILD_STATIC_OP(update_inputs_cpu)
{"seq_lens_encoder", "seq_lens_encoder_out"},
{"seq_lens_decoder", "seq_lens_decoder_out"},
{"input_ids", "input_ids_out"}})
.SetKernelFn(PD_KERNEL(UpdateInputes));
.SetKernelFn(PD_KERNEL(UpdateInputs));

View File

@@ -38,7 +38,7 @@ class type2value<phi::dtype::float16> {
template <paddle::DataType D>
std::vector<paddle::Tensor> AppendAttentionKernel(
void AppendAttentionKernel(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv,
const paddle::Tensor& key_cache,
@@ -60,6 +60,7 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const paddle::Tensor& decoder_num_blocks,
const paddle::Tensor& set_max_lengths,
const paddle::Tensor& max_len_kv,
paddle::Tensor& fmha_out,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& qkv_bias,
@@ -72,7 +73,11 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& out_linear_shifts,
const paddle::optional<paddle::Tensor>& out_linear_smooths,
const paddle::optional<paddle::Tensor>& mask_offset,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
@@ -118,27 +123,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
} else {
qkv_out = qkv;
}
paddle::Tensor fmha_out;
if (out_linear_in_scale > 0.0) {
if (fabs(quant_max_bound - 127.0f) < 0.000001) {
fmha_out = GetEmptyTensor(
{meta_data.token_nums, meta_data.q_num_heads * meta_data.head_dims},
paddle::DataType::INT8,
qkv.place());
} else if (fabs(quant_max_bound - 448.0f) < 0.000001) {
fmha_out = GetEmptyTensor(
{meta_data.token_nums, meta_data.q_num_heads * meta_data.head_dims},
paddle::DataType::FLOAT8_E4M3FN,
qkv.place());
}else{
PD_THROW("Only supported attr of quant_max_bound in ['127', '448'].");
}
} else {
fmha_out = GetEmptyTensor(
{meta_data.token_nums, meta_data.q_num_heads * meta_data.head_dims},
D,
qkv.place());
}
auto dispatch_CascadeAppendAttentionKernel = [&](auto temp_args,
const paddle::Tensor& lambda_batch_ids,
@@ -156,8 +140,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
key_cache,
value_cache,
attn_mask,
cache_k_dequant_scales,
cache_v_dequant_scales,
cache_quant_type_str == "block_wise_fp8" ? cache_k_quant_scales : cache_k_dequant_scales,
cache_quant_type_str == "block_wise_fp8" ? cache_v_quant_scales : cache_v_dequant_scales,
cache_k_zp,
cache_v_zp,
out_linear_shifts,
@@ -223,7 +207,10 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
main_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
};
if (qkv_out_scales) {
@@ -286,11 +273,15 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
cache_v_zp,
cache_quant_type_str,
use_neox_rotary_style,
rope_3d,
max_input_length,
exec_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
} else {
SpeculateWriteCacheWithRoPEKernel<data_t, data_t>(
meta_data,
@@ -309,11 +300,15 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
cache_v_zp,
cache_quant_type_str,
use_neox_rotary_style,
rope_3d,
max_input_length,
exec_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
}
} else {
if (qkv_out_scales) {
@@ -322,7 +317,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
qkv, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_tables,
rotary_embs,
@@ -339,14 +333,16 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
exec_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
} else {
DecoderWriteCacheWithRoPEKernel<data_t, data_t>(
meta_data,
qkv_out, // [token_num, num_heads, head_dim]
seq_lens_decoder,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_tables,
rotary_embs,
@@ -363,7 +359,10 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
exec_stream,
&qkv_out,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
const_cast<paddle::Tensor*>(&value_cache),
q_norm_weight,
k_norm_weight,
rms_norm_eps);
}
}
@@ -392,8 +391,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
cudaStreamWaitEvent(main_stream, decoder_event);
}
}
return {fmha_out, qkv_out};
}
std::vector<paddle::Tensor> AppendAttention(
@@ -429,7 +426,11 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& out_linear_shifts,
const paddle::optional<paddle::Tensor>& out_linear_smooths,
const paddle::optional<paddle::Tensor>& mask_offset,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
@@ -464,8 +465,60 @@ std::vector<paddle::Tensor> AppendAttention(
meta_data.block_size = key_cache.dims()[2];
meta_data.batch_size = seq_lens_this_time.dims()[0];
auto dispatch_by_template = [&](auto temp_args) -> std::vector<paddle::Tensor> {
return AppendAttentionKernel<type2value<decltype(temp_args)>::value>(
// template dtype generation
phi::DataType dtype_id;
switch (qkv.dtype()) {
case paddle::DataType::FLOAT16: {dtype_id = phi::DataType::FLOAT16; break;}
case paddle::DataType::BFLOAT16: {dtype_id = phi::DataType::BFLOAT16; break;}
case paddle::DataType::INT32: {
if (compute_dtype == "bf16") {
dtype_id = phi::DataType::BFLOAT16;
break;
} else if (compute_dtype == "fp16") {
dtype_id = phi::DataType::FLOAT16;
break;
} else {
PD_THROW("Only supported attr of compute_dtype in ['fp16', 'bf16'].");
break;
}
}
default: {
PD_THROW(
"NOT supported data type. "
"Only float16 and bfloat16 are supported. ");
break;
}
}
// fmha_out generation, rewrite from AppendAttentionKernel
paddle::Tensor fmha_out;
if (out_linear_in_scale > 0.0) {
if (fabs(quant_max_bound - 127.0f) < 0.000001) {
fmha_out = GetEmptyTensor(
{meta_data.token_nums, meta_data.q_num_heads * meta_data.head_dims},
paddle::DataType::INT8,
qkv.place());
} else if (fabs(quant_max_bound - 448.0f) < 0.000001) {
fmha_out = GetEmptyTensor(
{meta_data.token_nums, meta_data.q_num_heads * meta_data.head_dims},
paddle::DataType::FLOAT8_E4M3FN,
qkv.place());
} else{
PD_THROW("Only supported attr of quant_max_bound in ['127', '448'].");
}
} else {
fmha_out = GetEmptyTensor(
{meta_data.token_nums, meta_data.q_num_heads * meta_data.head_dims},
dtype_id,
qkv.place());
}
if (mask_offset) {
meta_data.mask_offset = mask_offset.get().data<int>();
}
auto dispatch_by_template = [&](auto temp_args) -> void {
AppendAttentionKernel<type2value<decltype(temp_args)>::value>(
meta_data,
qkv,
key_cache,
@@ -487,6 +540,7 @@ std::vector<paddle::Tensor> AppendAttention(
decoder_num_blocks,
set_max_lengths,
max_len_kv,
fmha_out,
rotary_embs,
attn_mask,
qkv_bias,
@@ -499,7 +553,11 @@ std::vector<paddle::Tensor> AppendAttention(
cache_v_zp,
out_linear_shifts,
out_linear_smooths,
mask_offset,
kv_signal_data,
q_norm_weight,
k_norm_weight,
rms_norm_eps,
cache_quant_type_str,
use_neox_rotary_style,
rope_3d,
@@ -514,20 +572,183 @@ std::vector<paddle::Tensor> AppendAttention(
speculate_max_draft_token_num,
causal,
speculate_decoder);
};
phi::dtype::float16 fp16_dtype;
phi::dtype::bfloat16 bp16_dtype;
switch (dtype_id){
case phi::DataType::FLOAT16: {
dispatch_by_template(fp16_dtype);
return {fmha_out};
}
case phi::DataType::BFLOAT16: {
dispatch_by_template(bp16_dtype);
return {fmha_out};
}
default:
PD_THROW(
"NOT supported data type. "
"Only float16 and bfloat16 are supported. ");
break;
}
return {paddle::Tensor{}};
}
void AppendAttentionWithOutput(
const paddle::Tensor& qkv,
const paddle::Tensor& key_cache,
const paddle::Tensor& value_cache,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
const paddle::Tensor& encoder_num_blocks,
const paddle::Tensor& kv_batch_ids,
const paddle::Tensor& kv_tile_ids_per_batch,
const paddle::Tensor& kv_num_blocks,
const paddle::Tensor& decoder_batch_ids,
const paddle::Tensor& decoder_tile_ids_per_batch,
const paddle::Tensor& decoder_num_blocks,
const paddle::Tensor& set_max_lengths,
const paddle::Tensor& max_len_kv,
paddle::Tensor& fmha_out,
const paddle::optional<paddle::Tensor>& rotary_embs,
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& qkv_bias,
const paddle::optional<paddle::Tensor>& qkv_out_scales,
const paddle::optional<paddle::Tensor>& cache_k_quant_scales,
const paddle::optional<paddle::Tensor>& cache_v_quant_scales,
const paddle::optional<paddle::Tensor>& cache_k_dequant_scales,
const paddle::optional<paddle::Tensor>& cache_v_dequant_scales,
const paddle::optional<paddle::Tensor>& cache_k_zp,
const paddle::optional<paddle::Tensor>& cache_v_zp,
const paddle::optional<paddle::Tensor>& out_linear_shifts,
const paddle::optional<paddle::Tensor>& out_linear_smooths,
const paddle::optional<paddle::Tensor>& mask_offset,
const paddle::optional<paddle::Tensor>& kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_input_length,
const float quant_max_bound,
const float quant_min_bound,
const float out_linear_in_scale,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder) {
AppendAttnMetaData meta_data;
const auto& qkv_dims = qkv.dims();
const auto& key_cache_dims = key_cache.dims();
meta_data.token_nums = qkv_dims[0];
meta_data.kv_num_heads = key_cache_dims[1];
meta_data.head_dims = key_cache_dims[3];
// TODO: trick method support c4, add attr head_dims in the future
if (cache_quant_type_str == "cache_int4_zp") {
meta_data.head_dims *= 2;
}
const int total_num_head =
qkv_dims[qkv_dims.size() - 1] / meta_data.head_dims;
meta_data.q_num_heads = total_num_head - 2 * meta_data.kv_num_heads;
meta_data.max_blocks_per_seq = block_tables.dims()[1];
meta_data.block_size = key_cache.dims()[2];
meta_data.batch_size = seq_lens_this_time.dims()[0];
if (mask_offset) {
meta_data.mask_offset = mask_offset.get().data<int>();
}
auto dispatch_by_template = [&](auto temp_args) -> void {
AppendAttentionKernel<type2value<decltype(temp_args)>::value>(
meta_data,
qkv,
key_cache,
value_cache,
seq_lens_encoder,
seq_lens_decoder,
seq_lens_this_time,
batch_id_per_token,
cu_seqlens_q,
block_tables,
encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks,
kv_batch_ids,
kv_tile_ids_per_batch,
kv_num_blocks,
decoder_batch_ids,
decoder_tile_ids_per_batch,
decoder_num_blocks,
set_max_lengths,
max_len_kv,
fmha_out,
rotary_embs,
attn_mask,
qkv_bias,
qkv_out_scales,
cache_k_quant_scales,
cache_v_quant_scales,
cache_k_dequant_scales,
cache_v_dequant_scales,
cache_k_zp,
cache_v_zp,
out_linear_shifts,
out_linear_smooths,
mask_offset,
kv_signal_data,
q_norm_weight,
k_norm_weight,
rms_norm_eps,
cache_quant_type_str,
use_neox_rotary_style,
rope_3d,
max_input_length,
quant_max_bound,
quant_min_bound,
out_linear_in_scale,
encoder_block_shape_q,
decoder_block_shape_q,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
causal,
speculate_decoder);
};
phi::dtype::float16 fp16_dtype;
phi::dtype::bfloat16 bp16_dtype;
switch (qkv.dtype()) {
case paddle::DataType::FLOAT16: return dispatch_by_template(fp16_dtype);
case paddle::DataType::BFLOAT16: return dispatch_by_template(bp16_dtype);
case paddle::DataType::FLOAT16: {
dispatch_by_template(fp16_dtype);
break;
}
case paddle::DataType::BFLOAT16: {
dispatch_by_template(bp16_dtype);
break;
}
case paddle::DataType::INT32: {
if (compute_dtype == "bf16") {
return dispatch_by_template(bp16_dtype);
dispatch_by_template(bp16_dtype);
break;
} else if (compute_dtype == "fp16") {
return dispatch_by_template(fp16_dtype);
dispatch_by_template(fp16_dtype);
break;
} else {
PD_THROW("Only supported attr of compute_dtype in ['fp16', 'bf16'].");
break;
@@ -540,9 +761,9 @@ std::vector<paddle::Tensor> AppendAttention(
break;
}
}
return {paddle::Tensor{}};
}
std::vector<std::vector<int64_t>> AppendAttentionInferShape(
const std::vector<int64_t>& qkv_shape,
const std::vector<int64_t>& key_cache_shape,
@@ -576,7 +797,11 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
const paddle::optional<std::vector<int64_t>>& cache_v_zp_shape,
const paddle::optional<std::vector<int64_t>>& out_linear_shifts_shape,
const paddle::optional<std::vector<int64_t>>& out_linear_smooths_shape,
const paddle::optional<std::vector<int64_t>>& mask_offset_shape,
const paddle::optional<std::vector<int64_t>>& kv_signal_data_shape,
const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
@@ -600,7 +825,7 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
}
const int total_num_head = qkv_shape[qkv_shape.size() - 1] / head_dim;
const int num_heads = total_num_head - 2 * kv_num_heads;
return {{token_num, num_heads * head_dim}, qkv_shape};
return {{token_num, num_heads * head_dim}};
}
std::vector<paddle::DataType> AppendAttentionInferDtype(
@@ -636,7 +861,11 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
const paddle::optional<paddle::DataType>& cache_v_zp_dtype,
const paddle::optional<paddle::DataType>& out_linear_shifts_dtype,
const paddle::optional<paddle::DataType>& out_linear_smooths_dtype,
const paddle::optional<paddle::DataType>& mask_offset_dtype,
const paddle::optional<paddle::DataType>& kv_signal_data_dtype,
const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
@@ -655,32 +884,148 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
if (compute_dtype == "bf16") {
if (out_linear_in_scale > 0.0) {
if (fabs(quant_max_bound - 127.0f) < 0.000001) {
return {paddle::DataType::INT8, paddle::DataType::BFLOAT16};
return {paddle::DataType::INT8};
} else if (fabs(quant_max_bound - 448.0f) < 0.000001) {
return {paddle::DataType::FLOAT8_E4M3FN, paddle::DataType::BFLOAT16};
return {paddle::DataType::FLOAT8_E4M3FN};
}else{
PD_THROW("Only supported attr of quant_max_bound in ['127.0', '448.0'].");
}
} else {
return {paddle::DataType::BFLOAT16, paddle::DataType::BFLOAT16};
return {paddle::DataType::BFLOAT16};
}
} else if (compute_dtype == "fp16") {
if (out_linear_in_scale > 0.0) {
if (fabs(quant_max_bound - 127.0f) < 0.000001) {
return {paddle::DataType::INT8, paddle::DataType::FLOAT16};
return {paddle::DataType::INT8};
} else if (fabs(quant_max_bound - 448.0f) < 0.000001) {
return {paddle::DataType::FLOAT8_E4M3FN, paddle::DataType::FLOAT16};
return {paddle::DataType::FLOAT8_E4M3FN};
}else{
PD_THROW("Only supported attr of quant_max_bound in ['127.0', '448.0'].");
}
} else {
return {paddle::DataType::FLOAT16, paddle::DataType::FLOAT16};
return {paddle::DataType::FLOAT16};
}
} else {
PD_THROW("Only supported attr of compute_dtype in ['fp16', 'bf16'].");
}
}
std::vector<std::vector<int64_t>> AppendAttentionWithOutputInferShape(
const std::vector<int64_t>& qkv_shape,
const std::vector<int64_t>& key_cache_shape,
const std::vector<int64_t>& value_cache_shape,
const std::vector<int64_t>& seq_lens_encoder_shape,
const std::vector<int64_t>& seq_lens_decoder_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
const std::vector<int64_t>& batch_id_per_token_shape,
const std::vector<int64_t>& cu_seqlens_q_shape,
const std::vector<int64_t>& block_tables_shape,
const std::vector<int64_t>& encoder_batch_ids_shape,
const std::vector<int64_t>& encoder_tile_ids_per_batch_shape,
const std::vector<int64_t>& encoder_num_blocks_shape,
const std::vector<int64_t>& kv_batch_ids_shape,
const std::vector<int64_t>& kv_tile_ids_per_batch_shape,
const std::vector<int64_t>& kv_num_blocks_shape,
const std::vector<int64_t>& decoder_batch_ids_shape,
const std::vector<int64_t>& decoder_tile_ids_per_batch_shape,
const std::vector<int64_t>& decoder_num_blocks_shape,
const std::vector<int64_t>& set_max_lengths_shape,
const std::vector<int64_t>& max_len_kv_shape,
const std::vector<int64_t>& fmha_out_shape,
const paddle::optional<std::vector<int64_t>>& rotary_embs_shape,
const paddle::optional<std::vector<int64_t>>& attn_mask_shape,
const paddle::optional<std::vector<int64_t>>& qkv_bias_shape,
const paddle::optional<std::vector<int64_t>>& qkv_out_scales_shape,
const paddle::optional<std::vector<int64_t>>& cache_k_quant_scales_shape,
const paddle::optional<std::vector<int64_t>>& cache_v_quant_scales_shape,
const paddle::optional<std::vector<int64_t>>& cache_k_dequant_scales_shape,
const paddle::optional<std::vector<int64_t>>& cache_v_dequant_scales_shape,
const paddle::optional<std::vector<int64_t>>& cache_k_zp_shape,
const paddle::optional<std::vector<int64_t>>& cache_v_zp_shape,
const paddle::optional<std::vector<int64_t>>& out_linear_shifts_shape,
const paddle::optional<std::vector<int64_t>>& out_linear_smooths_shape,
const paddle::optional<std::vector<int64_t>>& mask_offset_shape,
const paddle::optional<std::vector<int64_t>>& kv_signal_data_shape,
const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_input_length,
const float quant_max_bound,
const float quant_min_bound,
const float out_linear_in_scale,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder) {
return {fmha_out_shape};
}
std::vector<paddle::DataType> AppendAttentionWithOutputInferDtype(
const paddle::DataType& qkv_dtype,
const paddle::DataType& key_cache_dtype,
const paddle::DataType& value_cache_dtype,
const paddle::DataType& seq_lens_encoder_dtype,
const paddle::DataType& seq_lens_decoder_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
const paddle::DataType& batch_id_per_token_dtype,
const paddle::DataType& cu_seqlens_q_dtype,
const paddle::DataType& block_tables_dtype,
const paddle::DataType& encoder_batch_ids_dtype,
const paddle::DataType& encoder_tile_ids_per_batch_dtype,
const paddle::DataType& encoder_num_blocks_dtype,
const paddle::DataType& kv_batch_ids_dtype,
const paddle::DataType& kv_tile_ids_per_batch_dtype,
const paddle::DataType& kv_num_blocks_dtype,
const paddle::DataType& decoder_batch_ids_dtype,
const paddle::DataType& decoder_tile_ids_per_batch_dtype,
const paddle::DataType& decoder_num_blocks_dtype,
const paddle::DataType& set_max_lengths_dtype,
const paddle::DataType& max_len_kv_dtype,
const paddle::DataType& fmha_out_dtype,
const paddle::optional<paddle::DataType>& rotary_embs_dtype,
const paddle::optional<paddle::DataType>& attn_mask_dtype,
const paddle::optional<paddle::DataType>& qkv_bias_dtype,
const paddle::optional<paddle::DataType>& qkv_out_scales_dtype,
const paddle::optional<paddle::DataType>& cache_k_quant_scales_dtype,
const paddle::optional<paddle::DataType>& cache_v_quant_scales_dtype,
const paddle::optional<paddle::DataType>& cache_k_dequant_scales_dtype,
const paddle::optional<paddle::DataType>& cache_v_dequant_scales_dtype,
const paddle::optional<paddle::DataType>& cache_k_zp_dtype,
const paddle::optional<paddle::DataType>& cache_v_zp_dtype,
const paddle::optional<paddle::DataType>& out_linear_shifts_dtype,
const paddle::optional<paddle::DataType>& out_linear_smooths_dtype,
const paddle::optional<paddle::DataType>& mask_offset_dtype,
const paddle::optional<paddle::DataType>& kv_signal_data_dtype,
const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
const float rms_norm_eps,
const std::string& compute_dtype,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_input_length,
const float quant_max_bound,
const float quant_min_bound,
const float out_linear_in_scale,
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder) {
return {fmha_out_dtype};
}
PD_BUILD_STATIC_OP(append_attention)
.Inputs({"qkv",
"key_cache",
@@ -714,11 +1059,15 @@ PD_BUILD_STATIC_OP(append_attention)
paddle::Optional("cache_v_zp"),
paddle::Optional("out_linear_shifts"),
paddle::Optional("out_linear_smooths"),
paddle::Optional("kv_signal_data")})
.Outputs({"fmha_out", "qkv_out", "key_cache_out", "value_cache_out"})
paddle::Optional("mask_offset"),
paddle::Optional("kv_signal_data"),
paddle::Optional("q_norm_weight"),
paddle::Optional("k_norm_weight")})
.Outputs({"fmha_out", "key_cache_out", "value_cache_out"})
.SetInplaceMap({{"key_cache", "key_cache_out"},
{"value_cache", "value_cache_out"}})
.Attrs({"compute_type: std::string",
.Attrs({"rms_norm_eps: float",
"compute_type: std::string",
"cache_quant_type: std::string",
"use_neox_rotary_style: bool",
"rope_3d: bool",
@@ -732,7 +1081,71 @@ PD_BUILD_STATIC_OP(append_attention)
"encoder_max_partition_size: int",
"speculate_max_draft_token_num: int",
"causal: bool",
"speculate_decoder: bool"})
"speculate_decoder: bool",
})
.SetKernelFn(PD_KERNEL(AppendAttention))
.SetInferShapeFn(PD_INFER_SHAPE(AppendAttentionInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AppendAttentionInferDtype));
PD_BUILD_STATIC_OP(append_attention_with_output)
.Inputs({"qkv",
"key_cache",
"value_cache",
"seq_lens_encoder",
"seq_lens_decoder",
"seq_lens_this_time",
"batch_id_per_token",
"cu_seqlens_q",
"block_tables",
"encoder_batch_ids",
"encoder_tile_ids_per_batch",
"encoder_num_blocks",
"kv_batch_ids",
"kv_tile_ids_per_batch",
"kv_num_blocks",
"decoder_batch_ids",
"decoder_tile_ids_per_batch",
"decoder_num_blocks",
"set_max_lengths",
"max_len_kv",
"fmha_out",
paddle::Optional("rotary_embs"),
paddle::Optional("attn_mask"),
paddle::Optional("qkv_bias"),
paddle::Optional("qkv_out_scales"),
paddle::Optional("cache_k_quant_scales"),
paddle::Optional("cache_v_quant_scales"),
paddle::Optional("cache_k_dequant_scales"),
paddle::Optional("cache_v_dequant_scales"),
paddle::Optional("cache_k_zp"),
paddle::Optional("cache_v_zp"),
paddle::Optional("out_linear_shifts"),
paddle::Optional("out_linear_smooths"),
paddle::Optional("mask_offset"),
paddle::Optional("kv_signal_data"),
paddle::Optional("q_norm_weight"),
paddle::Optional("k_norm_weight")})
.Outputs({"fmha_out_out", "qkv_out", "key_cache_out", "value_cache_out"})
.SetInplaceMap({{"fmha_out", "fmha_out_out"},
{"key_cache", "key_cache_out"},
{"value_cache", "value_cache_out"}})
.Attrs({"rms_norm_eps: float",
"compute_type: std::string",
"cache_quant_type: std::string",
"use_neox_rotary_style: bool",
"rope_3d: bool",
"max_input_length: int",
"quant_max_bound: float",
"quant_min_bound: float",
"out_linear_in_scale: float",
"encoder_block_shape_q: int",
"decoder_block_shape_q: int",
"max_partition_size: int",
"encoder_max_partition_size: int",
"speculate_max_draft_token_num: int",
"causal: bool",
"speculate_decoder: bool",
})
.SetKernelFn(PD_KERNEL(AppendAttentionWithOutput))
.SetInferShapeFn(PD_INFER_SHAPE(AppendAttentionWithOutputInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AppendAttentionWithOutputInferDtype));

View File

@@ -43,6 +43,7 @@ __global__ void multi_query_append_attention_kernel(
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int *__restrict__ mask_offset,
const int max_seq_len,
const int max_dec_len,
const int max_block_num_per_seq,
@@ -51,6 +52,7 @@ __global__ void multi_query_append_attention_kernel(
const float quant_min_bound,
const float in_scale,
const uint32_t chunk_size,
const int num_blocks_x_cpu,
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
// num_heads, head_dim]
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
@@ -73,6 +75,11 @@ __global__ void multi_query_append_attention_kernel(
block_table_now = block_table + batch_id * max_block_num_per_seq;
//When cudagraph capture prefill, may launch more gridDim.x
if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
return;
}
const uint32_t q_len = seq_lens[batch_id];
if (q_len <= 0) {
return;
@@ -141,6 +148,7 @@ __global__ void multi_query_append_attention_kernel(
} else {
o_base_ptr_int8 = out + o_offset;
}
const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
smem_t qo_smem(smem);
uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -179,7 +187,7 @@ __global__ void multi_query_append_attention_kernel(
kv_len - q_len +
tile_id * num_rows_per_block / GROUP_SIZE,
chunk_start)))
: chunk_len) /
: mask_offset ? 0 : chunk_len) /
(num_frags_z * 16);
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
8 * (tid / 16) + tid % 8, (tid % 16) / 8);
@@ -245,12 +253,16 @@ __global__ void multi_query_append_attention_kernel(
NUM_WARPS,
num_frags_x,
num_frags_y,
num_frags_z>(q_base_seq_id_this_block,
num_frags_z>(nullptr,
q_base_seq_id_this_block,
kv_idx_base,
q_len,
kv_len,
chunk_end,
s_frag);
-1,
s_frag,
mask_offset_this_seq);
}
// update m,d
@@ -406,6 +418,8 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int *__restrict__ mask_offset,
const bool *__restrict__ attn_mask, // [bsz, max_q, max_q] for tree-mask
const int max_seq_len,
const int max_dec_len,
const int max_block_num_per_seq,
@@ -414,12 +428,14 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const float quant_min_bound,
const float in_scale,
const uint32_t chunk_size,
const int num_blocks_x_cpu,
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
// num_heads, head_dim]
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5) {
const int speculate_max_draft_token_num = 5,
const uint32_t attn_mask_len = -1) {
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
static_assert(NUM_WARP_Q == 1, "NUM_WARP_Q must be 1");
static_assert(NUM_WARP_KV == 4, "NUM_WARP_KV must be 4");
@@ -436,6 +452,11 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const uint32_t num_rows_per_block = num_frags_x * 16;
const int *block_table_now = block_table + batch_id * max_block_num_per_seq;
//When cudagraph capture prefill, may launch more gridDim.x
if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
return;
}
const uint32_t q_len = seq_lens[batch_id];
if (q_len <= 0) {
return;
@@ -502,7 +523,7 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
tid % 8 * num_elems_per_128b<T>();
}
}
const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
smem_t qo_smem(smem);
uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -540,10 +561,9 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
const uint32_t mask_check_iteration =
(CAUSAL ? (min(chunk_len,
sub_if_greater_or_zero(
kv_len - q_len +
tile_id * num_rows_per_block / GROUP_SIZE,
kv_len - q_len,
chunk_start)))
: chunk_len) /
: mask_offset ? 0 : chunk_len) /
(NUM_WARP_KV * num_frags_z * 16);
uint32_t k_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -611,12 +631,15 @@ __global__ void multi_query_append_attention_warp1_4_kernel(
NUM_WARPS,
num_frags_x,
num_frags_y,
num_frags_z>(q_base_seq_id_this_block,
num_frags_z>(attn_mask ? attn_mask + batch_id * attn_mask_len *attn_mask_len : nullptr,
q_base_seq_id_this_block,
kv_idx_base + wid * num_frags_z * 16,
q_len,
kv_len,
chunk_end,
s_frag);
attn_mask_len,
s_frag,
mask_offset_this_seq);
}
// update m,d
@@ -882,6 +905,7 @@ void MultiQueryAppendAttention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -890,6 +914,7 @@ void MultiQueryAppendAttention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
nullptr,
nullptr,
nullptr,
@@ -939,6 +964,7 @@ void MultiQueryAppendAttention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -947,6 +973,7 @@ void MultiQueryAppendAttention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
@@ -1061,12 +1088,18 @@ void MultiQueryAppendAttention(
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
const int num_chunks = div_up(max_dec_len, chunk_size);
uint32_t attn_mask_len;
if (attn_mask) {
attn_mask_len = attn_mask.get().shape()[1];
} else {
attn_mask_len = -1;
}
const int num_chunks = div_up(max_seq_len, chunk_size);
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
dim3 blocks(32, num_warps);
if (num_chunks <= 1) {
if (num_chunks <= 0) {
auto nosplit_kv_kernel =
multi_query_append_attention_warp1_4_kernel<NV_TYPE,
false,
@@ -1104,6 +1137,9 @@ void MultiQueryAppendAttention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
attn_mask ? const_cast<bool *>(attn_mask.get().data<bool>())
: nullptr,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1112,11 +1148,13 @@ void MultiQueryAppendAttention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
nullptr,
nullptr,
nullptr,
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
attn_mask_len);
} else {
phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d;
if (is_decoder) {
@@ -1161,8 +1199,8 @@ void MultiQueryAppendAttention(
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(smooth_weight.get().data<T>()))
: nullptr,
@@ -1172,6 +1210,9 @@ void MultiQueryAppendAttention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
attn_mask ? const_cast<bool *>(attn_mask.get().data<bool>())
: nullptr,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1180,11 +1221,13 @@ void MultiQueryAppendAttention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
attn_mask_len);
// merge
constexpr int vec_size = num_elems_per_128b<NV_TYPE>();
@@ -1208,8 +1251,8 @@ void MultiQueryAppendAttention(
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
smooth_weight.get().data<T>()))
: nullptr,
@@ -1226,14 +1269,14 @@ void MultiQueryAppendAttention(
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,
blocky,
HEAD_DIM,
OUT_NV_TYPE,
ENABLE_PREFILL>
vec_size,
blocky,
HEAD_DIM,
OUT_NV_TYPE,
ENABLE_PREFILL>
<<<grids_merge, blocks_merge, 0, stream>>>(
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
@@ -1244,8 +1287,8 @@ void MultiQueryAppendAttention(
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
smooth_weight.get().data<T>()))
: nullptr,

View File

@@ -48,6 +48,7 @@ __global__ void multi_query_append_attention_c4_kernel(
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int *__restrict__ mask_offset,
const int max_seq_len,
const int max_dec_len,
const int max_block_num_per_seq,
@@ -56,6 +57,7 @@ __global__ void multi_query_append_attention_c4_kernel(
const float quant_min_bound,
const float in_scale,
const uint32_t chunk_size,
const int num_blocks_x_cpu,
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
// num_heads, head_dim]
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
@@ -84,6 +86,11 @@ __global__ void multi_query_append_attention_c4_kernel(
block_table_now = block_table + batch_id * max_block_num_per_seq;
//When cudagraph capture prefill, may launch more gridDim.x
if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
return;
}
const uint32_t q_len = seq_lens[batch_id];
if (q_len <= 0) {
return;
@@ -172,6 +179,7 @@ __global__ void multi_query_append_attention_c4_kernel(
} else {
o_base_ptr_int8 = out + o_offset;
}
const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
smem_t qo_smem(smem);
uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -248,7 +256,7 @@ __global__ void multi_query_append_attention_c4_kernel(
kv_len - q_len +
tile_id * num_rows_per_block / GROUP_SIZE,
chunk_start)))
: chunk_len) /
: mask_offset ? 0 : chunk_len) /
(num_frags_z * 16);
uint32_t k_smem_offset_r =
@@ -333,12 +341,15 @@ __global__ void multi_query_append_attention_c4_kernel(
NUM_WARPS,
num_frags_x,
num_frags_y,
num_frags_z>(q_base_seq_id_this_block,
num_frags_z>(nullptr,
q_base_seq_id_this_block,
kv_idx_base,
q_len,
kv_len,
chunk_end,
s_frag);
-1,
s_frag,
mask_offset_this_seq);
}
update_mdo_states<num_frags_x, num_frags_y, num_frags_z>(
@@ -505,6 +516,8 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int *__restrict__ mask_offset,
const bool *__restrict__ attn_mask, // [bsz, max_q, max_q] for tree-mask
const int max_seq_len,
const int max_dec_len,
const int max_block_num_per_seq,
@@ -513,12 +526,14 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const float quant_min_bound,
const float in_scale,
const uint32_t chunk_size,
const int num_blocks_x_cpu,
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
// num_heads, head_dim]
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5) {
const int speculate_max_draft_token_num = 5,
const uint32_t attn_mask_len = -1) {
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
constexpr uint32_t num_vecs_per_head_k =
HEAD_DIM / 2 / num_elems_per_128b<CacheT>();
@@ -541,6 +556,11 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const uint32_t num_rows_per_block = num_frags_x * 16;
const int *block_table_now = block_table + batch_id * max_block_num_per_seq;
//When cudagraph capture prefill, may launch more gridDim.x
if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
return;
}
const uint32_t q_len = seq_lens[batch_id];
if (q_len <= 0) {
return;
@@ -627,7 +647,7 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
tid % 8 * num_elems_per_128b<T>();
}
}
const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
smem_t qo_smem(smem);
uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -703,10 +723,9 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
const uint32_t mask_check_iteration =
(CAUSAL ? (min(chunk_len,
sub_if_greater_or_zero(
kv_len - q_len +
tile_id * num_rows_per_block / GROUP_SIZE,
kv_len - q_len,
chunk_start)))
: chunk_len) /
: mask_offset ? 0 : chunk_len) /
(NUM_WARP_KV * num_frags_z * 16);
uint32_t k_smem_offset_r =
@@ -788,12 +807,15 @@ __global__ void multi_query_append_attention_c4_warp1_4_kernel(
NUM_WARPS,
num_frags_x,
num_frags_y,
num_frags_z>(q_base_seq_id_this_block,
num_frags_z>(attn_mask ? attn_mask + batch_id * attn_mask_len *attn_mask_len : nullptr,
q_base_seq_id_this_block,
kv_idx_base + wid * num_frags_z * 16,
q_len,
kv_len,
chunk_end,
s_frag);
attn_mask_len,
s_frag,
mask_offset_this_seq);
}
update_mdo_states<num_frags_x, num_frags_y, num_frags_z>(
@@ -1088,6 +1110,7 @@ void MultiQueryAppendC4Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1096,6 +1119,7 @@ void MultiQueryAppendC4Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
nullptr,
nullptr,
nullptr,
@@ -1151,6 +1175,7 @@ void MultiQueryAppendC4Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1159,6 +1184,7 @@ void MultiQueryAppendC4Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
@@ -1285,10 +1311,18 @@ void MultiQueryAppendC4Attention(
if (!is_decoder) {
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
const int num_chunks = div_up(max_dec_len, chunk_size);
const int num_chunks = div_up(max_seq_len, chunk_size);
uint32_t attn_mask_len;
if (attn_mask) {
attn_mask_len = attn_mask.get().shape()[1];
} else {
attn_mask_len = -1;
}
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
dim3 blocks(32, num_warps);
if (num_chunks <= 1) {
if (num_chunks <= 0) {
auto nosplit_kv_kernel =
multi_query_append_attention_c4_warp1_4_kernel<NV_TYPE,
uint8_t,
@@ -1334,6 +1368,9 @@ void MultiQueryAppendC4Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
attn_mask ? const_cast<bool *>(attn_mask.get().data<bool>())
: nullptr,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1342,11 +1379,13 @@ void MultiQueryAppendC4Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
nullptr,
nullptr,
nullptr,
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
attn_mask_len);
} else {
phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d;
if (is_decoder) {
@@ -1392,15 +1431,15 @@ void MultiQueryAppendC4Attention(
const_cast<uint8_t *>(cache_v.data<uint8_t>()),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_scale.data<T>())),
cache_k_zp ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(cache_k_zp.get().data<T>()))
: nullptr,
const_cast<T *>(cache_k_zp.get().data<T>()))
: nullptr,
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_scale.data<T>())),
cache_v_zp ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(cache_v_zp.get().data<T>()))
: nullptr,
const_cast<T *>(cache_v_zp.get().data<T>()))
: nullptr,
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(smooth_weight.get().data<T>()))
: nullptr,
@@ -1410,6 +1449,9 @@ void MultiQueryAppendC4Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
attn_mask ? const_cast<bool *>(attn_mask.get().data<bool>())
: nullptr,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1418,11 +1460,13 @@ void MultiQueryAppendC4Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
attn_mask_len);
// merge
constexpr int vec_size = num_elems_per_128b<NV_TYPE>();
if (is_decoder) {
@@ -1445,8 +1489,8 @@ void MultiQueryAppendC4Attention(
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
smooth_weight.get().data<T>()))
: nullptr,
@@ -1463,14 +1507,14 @@ void MultiQueryAppendC4Attention(
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,
blocky,
HEAD_DIM,
OUT_NV_TYPE,
ENABLE_PREFILL>
vec_size,
blocky,
HEAD_DIM,
OUT_NV_TYPE,
ENABLE_PREFILL>
<<<grids_merge, blocks_merge, 0, stream>>>(
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
@@ -1481,8 +1525,8 @@ void MultiQueryAppendC4Attention(
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
smooth_weight.get().data<T>()))
: nullptr,

View File

@@ -32,14 +32,15 @@ template <typename T,
typename OutT = T,
bool ENABLE_PREFILL = true,
bool is_scale_channel_wise = false,
bool IsFP8=false>
bool IsFP8 = false,
bool IsDynamicC8 = false>
__global__ void multi_query_append_attention_c8_kernel(
T *__restrict__ q, // [token_num, (num_heads + 2* kv_num_head) * head_dim]
CacheT *__restrict__ cache_k, // [max_block_num, num_heads, block_size,
// head_dim]
CacheT *__restrict__ cache_v,
const T *__restrict__ cache_k_scale, // [num_kv_heads]
const T *__restrict__ cache_v_scale, // [num_kv_heads]
const T *__restrict__ cache_k_scale, // [num_kv_heads] or [max_block_num, num_heads, block_size]
const T *__restrict__ cache_v_scale, // [num_kv_heads] or [max_block_num, num_heads, block_size]
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
const int *__restrict__ seq_lens,
@@ -48,6 +49,7 @@ __global__ void multi_query_append_attention_c8_kernel(
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int *__restrict__ mask_offset,
const int max_seq_len,
const int max_dec_len,
const int max_block_num_per_seq,
@@ -56,6 +58,7 @@ __global__ void multi_query_append_attention_c8_kernel(
const float quant_min_bound,
const float in_scale,
const uint32_t chunk_size,
const int num_blocks_x_cpu,
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
// num_heads, head_dim]
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
@@ -85,33 +88,40 @@ __global__ void multi_query_append_attention_c8_kernel(
block_table_now = block_table + batch_id * max_block_num_per_seq;
//When cudagraph capture prefill, may launch more gridDim.x
if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
return;
}
const uint32_t q_len = seq_lens[batch_id];
if (q_len <= 0) {
return;
}
T cache_k_scale_reg[num_frags_y * 4];
T cache_v_scale_reg[num_frags_y * 2];
if (is_scale_channel_wise) {
int scale_col_base = threadIdx.x % 4 * 2 + kv_head_idx * HEAD_DIM;
const T *cache_k_scale_cur_head = cache_k_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_k_scale_reg[i * 4] = cache_k_scale_cur_head[scale_idx];
cache_k_scale_reg[i * 4 + 1] = cache_k_scale_cur_head[scale_idx + 1];
cache_k_scale_reg[i * 4 + 2] = cache_k_scale_cur_head[scale_idx + 8];
cache_k_scale_reg[i * 4 + 3] = cache_k_scale_cur_head[scale_idx + 9];
T cache_k_scale_reg[IsDynamicC8 ? num_frags_z * 2 : num_frags_y * 4];
T cache_v_scale_reg[IsDynamicC8 ? num_frags_z * 4 : num_frags_y * 2];
if constexpr (!IsDynamicC8) {
if constexpr (is_scale_channel_wise) {
int scale_col_base = threadIdx.x % 4 * 2 + kv_head_idx * HEAD_DIM;
const T *cache_k_scale_cur_head = cache_k_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_k_scale_reg[i * 4] = cache_k_scale_cur_head[scale_idx];
cache_k_scale_reg[i * 4 + 1] = cache_k_scale_cur_head[scale_idx + 1];
cache_k_scale_reg[i * 4 + 2] = cache_k_scale_cur_head[scale_idx + 8];
cache_k_scale_reg[i * 4 + 3] = cache_k_scale_cur_head[scale_idx + 9];
}
scale_col_base = threadIdx.x / 4 + kv_head_idx * HEAD_DIM;
const T *cache_v_scale_cur_head = cache_v_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_v_scale_reg[i * 2] = cache_v_scale_cur_head[scale_idx];
cache_v_scale_reg[i * 2 + 1] = cache_v_scale_cur_head[scale_idx + 8];
}
} else {
cache_k_scale_reg[0] = cache_k_scale[kv_head_idx];
cache_v_scale_reg[0] = cache_v_scale[kv_head_idx];
}
scale_col_base = threadIdx.x / 4 + kv_head_idx * HEAD_DIM;
const T *cache_v_scale_cur_head = cache_v_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_v_scale_reg[i * 2] = cache_v_scale_cur_head[scale_idx];
cache_v_scale_reg[i * 2 + 1] = cache_v_scale_cur_head[scale_idx + 8];
}
} else {
cache_k_scale_reg[0] = cache_k_scale[kv_head_idx];
cache_v_scale_reg[0] = cache_v_scale[kv_head_idx];
}
const uint32_t q_end =
@@ -179,6 +189,7 @@ __global__ void multi_query_append_attention_c8_kernel(
} else {
o_base_ptr_int8 = out + o_offset;
}
const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
smem_t qo_smem(smem);
uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -199,6 +210,13 @@ __global__ void multi_query_append_attention_c8_kernel(
smem_t k_smem(smem + NUM_WARPS * num_frags_x * 16 * HEAD_DIM * sizeof(T)),
v_smem(smem + NUM_WARPS * num_frags_x * 16 * HEAD_DIM * sizeof(T) +
num_frags_z * 16 * HEAD_DIM * sizeof(CacheT));
T* k_smem_scale = nullptr;
T* v_smem_scale = nullptr;
if constexpr (IsDynamicC8) {
k_smem_scale = reinterpret_cast<T*>(smem + NUM_WARPS * num_frags_x * 16 * HEAD_DIM * sizeof(T) +
num_frags_z * 16 * HEAD_DIM * sizeof(CacheT) * 2);
v_smem_scale = k_smem_scale + num_frags_z * 16;
}
const uint32_t num_iterations = div_up(
@@ -216,7 +234,7 @@ __global__ void multi_query_append_attention_c8_kernel(
kv_len - q_len +
tile_id * num_rows_per_block / GROUP_SIZE,
chunk_start)))
: chunk_len) /
: mask_offset ? 0 : chunk_len) /
(num_frags_z * 16);
uint32_t k_smem_offset_r =
@@ -280,10 +298,22 @@ __global__ void multi_query_append_attention_c8_kernel(
#pragma unroll 1
for (uint32_t iter = 0; iter < num_iterations; ++iter) {
if constexpr (IsDynamicC8) {
produce_k_dynamic_scale<BLOCK_SIZE, num_frags_z, NUM_WARP_Q, T>(
k_smem_scale,
cache_k_scale_reg,
block_table_now,
cache_k_scale,
kv_idx_base,
kv_num_heads,
kv_head_idx,
chunk_end
);
}
wait_group<1>();
__syncthreads();
// s = qk
compute_qk_c8<num_frags_x, num_frags_y, num_frags_z, T, CacheT, is_scale_channel_wise, IsFP8>(
compute_qk_c8<num_frags_x, num_frags_y, num_frags_z, T, CacheT, is_scale_channel_wise, IsFP8, IsDynamicC8>(
&qo_smem,
&q_smem_offset_r,
&k_smem,
@@ -300,12 +330,15 @@ __global__ void multi_query_append_attention_c8_kernel(
NUM_WARPS,
num_frags_x,
num_frags_y,
num_frags_z>(q_base_seq_id_this_block,
num_frags_z>(nullptr,
q_base_seq_id_this_block,
kv_idx_base,
q_len,
kv_len,
chunk_end,
s_frag);
-1,
s_frag,
mask_offset_this_seq);
}
// update m,d
@@ -313,6 +346,7 @@ __global__ void multi_query_append_attention_c8_kernel(
s_frag, o_frag, m_frag, d_frag);
__syncthreads();
const int ori_kv_idx_base = kv_idx_base;
kv_idx_base += num_frags_z * 16;
produce_k_blockwise_c8<SharedMemFillMode::kNoFill,
NUM_WARPS,
@@ -331,6 +365,18 @@ __global__ void multi_query_append_attention_c8_kernel(
chunk_end,
const_k_offset);
commit_group();
if constexpr (IsDynamicC8) {
produce_v_dynamic_scale<BLOCK_SIZE, num_frags_z, NUM_WARP_Q, T>(
v_smem_scale,
cache_v_scale_reg,
block_table_now,
cache_v_scale,
ori_kv_idx_base,
kv_num_heads,
kv_head_idx,
chunk_end
);
}
wait_group<1>();
__syncthreads();
@@ -341,7 +387,9 @@ __global__ void multi_query_append_attention_c8_kernel(
BLOCK_SIZE,
T,
CacheT,
is_scale_channel_wise, IsFP8>(
is_scale_channel_wise,
IsFP8,
IsDynamicC8>(
&v_smem, &v_smem_offset_r, s_frag, o_frag, d_frag, cache_v_scale_reg);
__syncthreads();
@@ -458,14 +506,15 @@ template <typename T,
typename OutT = T,
bool ENABLE_PREFILL = true,
bool is_scale_channel_wise=false,
bool IsFP8=false>
bool IsFP8 = false,
bool IsDynamicC8 = false>
__global__ void multi_query_append_attention_c8_warp1_4_kernel(
T *__restrict__ q, // [token_num, (num_heads + 2* kv_num_head) * head_dim]
CacheT *__restrict__ cache_k, // [max_block_num, num_heads, block_size,
// head_dim]
CacheT *__restrict__ cache_v,
const T *__restrict__ cache_k_scale, // [num_kv_heads, head_dim]
const T *__restrict__ cache_v_scale, // [num_kv_heads, head_dim]
const T *__restrict__ cache_k_scale, // [num_kv_heads] or [max_block_num, num_heads, block_size]
const T *__restrict__ cache_v_scale, // [num_kv_heads] or [max_block_num, num_heads, block_size]
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
const int *__restrict__ seq_lens,
@@ -474,6 +523,8 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const int *__restrict__ tile_ids_per_batch,
const int *__restrict__ cu_seqlens_q,
const int *__restrict__ block_table, // [bsz, block_num_per_seq]
const int *__restrict__ mask_offset,
const bool *__restrict__ attn_mask, // [bsz, max_q, max_q] for tree-mask
const int max_seq_len,
const int max_dec_len,
const int max_block_num_per_seq,
@@ -482,12 +533,14 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const float quant_min_bound,
const float in_scale,
const uint32_t chunk_size,
const int num_blocks_x_cpu,
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
// num_heads, head_dim]
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
OutT *__restrict__ out,
const int speculate_max_draft_token_num = 5) {
const int speculate_max_draft_token_num = 5,
const uint32_t attn_mask_len = -1) {
constexpr uint32_t num_vecs_per_head = HEAD_DIM / num_elems_per_128b<T>();
constexpr uint32_t num_vecs_per_head_k =
HEAD_DIM / num_elems_per_128b<CacheT>();
@@ -510,32 +563,39 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
const uint32_t num_rows_per_block = num_frags_x * 16;
const int *block_table_now = block_table + batch_id * max_block_num_per_seq;
//When cudagraph capture prefill, may launch more gridDim.x
if(btid >= static_cast<uint32_t>(num_blocks_x_cpu)){
return;
}
const uint32_t q_len = seq_lens[batch_id];
if (q_len <= 0) {
return;
}
T cache_k_scale_reg[num_frags_y * 4];
T cache_v_scale_reg[num_frags_y * 2];
if (is_scale_channel_wise) {
int scale_col_base = threadIdx.x % 4 * 2 + kv_head_idx * HEAD_DIM;
const T *cache_k_scale_cur_head = cache_k_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_k_scale_reg[i * 4] = cache_k_scale_cur_head[scale_idx];
cache_k_scale_reg[i * 4 + 1] = cache_k_scale_cur_head[scale_idx + 1];
cache_k_scale_reg[i * 4 + 2] = cache_k_scale_cur_head[scale_idx + 8];
cache_k_scale_reg[i * 4 + 3] = cache_k_scale_cur_head[scale_idx + 9];
T cache_k_scale_reg[IsDynamicC8 ? num_frags_z * 2 : num_frags_y * 4];
T cache_v_scale_reg[IsDynamicC8 ? num_frags_z * 4 : num_frags_y * 2];
if constexpr (!IsDynamicC8) {
if constexpr (is_scale_channel_wise) {
int scale_col_base = threadIdx.x % 4 * 2 + kv_head_idx * HEAD_DIM;
const T *cache_k_scale_cur_head = cache_k_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_k_scale_reg[i * 4] = cache_k_scale_cur_head[scale_idx];
cache_k_scale_reg[i * 4 + 1] = cache_k_scale_cur_head[scale_idx + 1];
cache_k_scale_reg[i * 4 + 2] = cache_k_scale_cur_head[scale_idx + 8];
cache_k_scale_reg[i * 4 + 3] = cache_k_scale_cur_head[scale_idx + 9];
}
scale_col_base = threadIdx.x / 4 + kv_head_idx * HEAD_DIM;
const T *cache_v_scale_cur_head = cache_v_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_v_scale_reg[i * 2] = cache_v_scale_cur_head[scale_idx];
cache_v_scale_reg[i * 2 + 1] = cache_v_scale_cur_head[scale_idx + 8];
}
} else {
cache_k_scale_reg[0] = cache_k_scale[kv_head_idx];
cache_v_scale_reg[0] = cache_v_scale[kv_head_idx];
}
scale_col_base = threadIdx.x / 4 + kv_head_idx * HEAD_DIM;
const T *cache_v_scale_cur_head = cache_v_scale + scale_col_base;
for (int i = 0; i < num_frags_y; ++i) {
const int scale_idx = i * 16;
cache_v_scale_reg[i * 2] = cache_v_scale_cur_head[scale_idx];
cache_v_scale_reg[i * 2 + 1] = cache_v_scale_cur_head[scale_idx + 8];
}
} else {
cache_k_scale_reg[0] = cache_k_scale[kv_head_idx];
cache_v_scale_reg[0] = cache_v_scale[kv_head_idx];
}
const uint32_t q_end =
min(q_len, div_up((tile_id + 1) * num_rows_per_block, GROUP_SIZE));
@@ -601,7 +661,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
tid % 8 * num_elems_per_128b<T>();
}
}
const int *mask_offset_this_seq = mask_offset ? mask_offset + q_start_seq_id * 2 : nullptr;
smem_t qo_smem(smem);
uint32_t q_smem_offset_r = smem_t::get_permuted_offset<num_vecs_per_head>(
@@ -626,6 +686,13 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
smem_t k_smem(smem + num_frags_x * 16 * HEAD_DIM * sizeof(T)),
v_smem(smem + num_frags_x * 16 * HEAD_DIM * sizeof(T) +
NUM_WARP_KV * num_frags_z * 16 * HEAD_DIM * sizeof(CacheT));
T* k_smem_scale = nullptr;
T* v_smem_scale = nullptr;
if constexpr (IsDynamicC8) {
k_smem_scale = reinterpret_cast<T*>(smem + num_frags_x * 16 * HEAD_DIM * sizeof(T) +
NUM_WARP_KV * num_frags_z * 16 * HEAD_DIM * sizeof(CacheT) * 2);
v_smem_scale = k_smem_scale + NUM_WARP_KV * num_frags_z * 16;
}
const uint32_t num_iterations = div_up(
CAUSAL
@@ -642,7 +709,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
kv_len - q_len +
tile_id * num_rows_per_block / GROUP_SIZE,
chunk_start)))
: chunk_len) /
: mask_offset ? 0 : chunk_len) /
(NUM_WARP_KV * num_frags_z * 16);
uint32_t k_smem_offset_r =
@@ -708,11 +775,23 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
commit_group();
#pragma unroll 1
for (uint32_t iter = 0; iter < num_iterations; ++iter) {
if constexpr (IsDynamicC8) {
produce_k_dynamic_scale<BLOCK_SIZE, num_frags_z, NUM_WARP_Q, T>(
k_smem_scale,
cache_k_scale_reg,
block_table_now,
cache_k_scale,
kv_idx_base,
kv_num_heads,
kv_head_idx,
chunk_end
);
}
wait_group<1>();
__syncthreads();
// s = qk
compute_qk_c8<num_frags_x, num_frags_y, num_frags_z, T, CacheT, is_scale_channel_wise, IsFP8>(
compute_qk_c8<num_frags_x, num_frags_y, num_frags_z, T, CacheT, is_scale_channel_wise, IsFP8, IsDynamicC8>(
&qo_smem,
&q_smem_offset_r,
&k_smem,
@@ -728,12 +807,16 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
NUM_WARPS,
num_frags_x,
num_frags_y,
num_frags_z>(q_base_seq_id_this_block,
num_frags_z>(attn_mask ? attn_mask + batch_id * attn_mask_len *attn_mask_len : nullptr,
q_base_seq_id_this_block,
kv_idx_base + wid * num_frags_z * 16,
q_len,
kv_len,
chunk_end,
s_frag);
attn_mask_len,
s_frag,
mask_offset_this_seq);
}
// update m,d
@@ -741,6 +824,7 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
s_frag, o_frag, m_frag, d_frag);
__syncthreads();
const uint32_t ori_kv_idx_base = kv_idx_base;
kv_idx_base += NUM_WARP_KV * num_frags_z * 16;
produce_k_blockwise_c8<SharedMemFillMode::kNoFill,
NUM_WARPS,
@@ -759,6 +843,18 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
chunk_end,
const_k_offset);
commit_group();
if constexpr (IsDynamicC8) {
produce_v_dynamic_scale<BLOCK_SIZE, num_frags_z, NUM_WARP_Q, T>(
v_smem_scale,
cache_v_scale_reg,
block_table_now,
cache_v_scale,
ori_kv_idx_base,
kv_num_heads,
kv_head_idx,
chunk_end
);
}
wait_group<1>();
__syncthreads();
@@ -769,7 +865,9 @@ __global__ void multi_query_append_attention_c8_warp1_4_kernel(
BLOCK_SIZE,
T,
CacheT,
is_scale_channel_wise, IsFP8>(
is_scale_channel_wise,
IsFP8,
IsDynamicC8>(
&v_smem, &v_smem_offset_r, s_frag, o_frag, d_frag, cache_v_scale_reg);
__syncthreads();
@@ -883,7 +981,8 @@ template <typename T,
uint32_t NUM_WARP_Q,
typename OutT = T,
bool ENABLE_PREFILL = true,
bool IsFP8=false>
bool IsFP8 = false,
bool IsDynamicC8 = false>
void MultiQueryAppendC8Attention(
const AppendAttnMetaData &meta_data,
const paddle::Tensor &qkv,
@@ -941,7 +1040,8 @@ void MultiQueryAppendC8Attention(
constexpr uint32_t num_frags_z = BLOCK_SIZE / 16;
constexpr uint32_t smem_size =
num_warps * num_frags_x * 16 * HEAD_DIM * sizeof(T) +
num_frags_z * 16 * HEAD_DIM * sizeof(uint8_t) * 2;
num_frags_z * 16 * HEAD_DIM * sizeof(uint8_t) * 2 +
num_frags_z * 16 * sizeof(T) * 2;
auto split_kv_kernel =
multi_query_append_attention_c8_kernel<NV_TYPE,
uint8_t,
@@ -958,7 +1058,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
false, IsFP8>;
false,
IsFP8,
IsDynamicC8>;
if (is_scale_channel_wise) {
split_kv_kernel =
multi_query_append_attention_c8_kernel<NV_TYPE,
@@ -976,7 +1078,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
true, IsFP8>;
true,
IsFP8,
IsDynamicC8>;
}
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(split_kv_kernel,
@@ -1010,7 +1114,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
false, IsFP8>;
false,
IsFP8,
IsDynamicC8>;
if (is_scale_channel_wise) {
nosplit_kv_kernel =
multi_query_append_attention_c8_kernel<NV_TYPE,
@@ -1028,7 +1134,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
true, IsFP8>;
true,
IsFP8,
IsDynamicC8>;
}
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(nosplit_kv_kernel,
@@ -1054,6 +1162,7 @@ void MultiQueryAppendC8Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1062,6 +1171,7 @@ void MultiQueryAppendC8Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
nullptr,
nullptr,
nullptr,
@@ -1111,6 +1221,7 @@ void MultiQueryAppendC8Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1119,6 +1230,7 @@ void MultiQueryAppendC8Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
@@ -1204,7 +1316,8 @@ void MultiQueryAppendC8Attention(
constexpr uint32_t num_frags_z = BLOCK_SIZE / 16 / NUM_WARP_KV * 2;
constexpr uint32_t smem_size =
num_frags_x * 16 * HEAD_DIM * sizeof(T) +
NUM_WARP_KV * num_frags_z * 16 * HEAD_DIM * sizeof(uint8_t) * 2;
NUM_WARP_KV * num_frags_z * 16 * HEAD_DIM * sizeof(uint8_t) * 2 +
NUM_WARP_KV * num_frags_z * 16 * sizeof(T) * 2;
auto split_kv_kernel =
multi_query_append_attention_c8_warp1_4_kernel<NV_TYPE,
uint8_t,
@@ -1221,7 +1334,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
false, IsFP8>;
false,
IsFP8,
IsDynamicC8>;
if (is_scale_channel_wise) {
split_kv_kernel =
multi_query_append_attention_c8_warp1_4_kernel<NV_TYPE,
@@ -1239,7 +1354,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
true, IsFP8>;
true,
IsFP8,
IsDynamicC8>;
}
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(split_kv_kernel,
@@ -1254,10 +1371,17 @@ void MultiQueryAppendC8Attention(
chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
}
const int num_chunks = div_up(max_dec_len, chunk_size);
const int num_chunks = div_up(max_seq_len, chunk_size);
uint32_t attn_mask_len;
if (attn_mask) {
attn_mask_len = attn_mask.get().shape()[1];
} else {
attn_mask_len = -1;
}
dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
dim3 blocks(32, num_warps);
if (num_chunks <= 1) {
if (num_chunks <= 0) {
auto nosplit_kv_kernel =
multi_query_append_attention_c8_warp1_4_kernel<NV_TYPE,
uint8_t,
@@ -1274,7 +1398,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
false, IsFP8>;
false,
IsFP8,
IsDynamicC8>;
if (is_scale_channel_wise) {
nosplit_kv_kernel =
multi_query_append_attention_c8_warp1_4_kernel<NV_TYPE,
@@ -1292,7 +1418,9 @@ void MultiQueryAppendC8Attention(
num_frags_y,
OUT_NV_TYPE,
ENABLE_PREFILL,
true, IsFP8>;
true,
IsFP8,
IsDynamicC8>;
}
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(nosplit_kv_kernel,
@@ -1318,6 +1446,9 @@ void MultiQueryAppendC8Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
attn_mask ? const_cast<bool *>(attn_mask.get().data<bool>())
: nullptr,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1326,11 +1457,13 @@ void MultiQueryAppendC8Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
nullptr,
nullptr,
nullptr,
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
attn_mask_len);
} else {
phi::Allocator::AllocationPtr tmp_workspace, tmp_m, tmp_d;
if (is_decoder) {
@@ -1377,8 +1510,8 @@ void MultiQueryAppendC8Attention(
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k_scale.data<T>())),
reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v_scale.data<T>())),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(smooth_weight.get().data<T>()))
: nullptr,
@@ -1388,6 +1521,9 @@ void MultiQueryAppendC8Attention(
tile_ids_per_batch.data<int>(),
cu_seqlens_q.data<int>(),
block_table.data<int>(),
meta_data.mask_offset,
attn_mask ? const_cast<bool *>(attn_mask.get().data<bool>())
: nullptr,
max_seq_len,
max_dec_len,
max_block_num_per_seq,
@@ -1396,11 +1532,13 @@ void MultiQueryAppendC8Attention(
quant_min_bound,
in_scale,
chunk_size,
num_blocks_x_cpu,
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
static_cast<float *>(tmp_d->ptr()),
reinterpret_cast<OUT_NV_TYPE *>(out->data<OutT>()),
speculate_max_draft_token_num);
speculate_max_draft_token_num,
attn_mask_len);
// merge
constexpr int vec_size = num_elems_per_128b<NV_TYPE>();
if (is_decoder) {
@@ -1418,8 +1556,8 @@ void MultiQueryAppendC8Attention(
seq_lens_encoder.data<int>(),
cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
smooth_weight.get().data<T>()))
: nullptr,
@@ -1436,14 +1574,14 @@ void MultiQueryAppendC8Attention(
constexpr int blockx = HEAD_DIM / vec_size;
constexpr int blocky = (128 + blockx - 1) / blockx;
dim3 grids_merge(min(sm_count * 4, token_num),
num_heads);
num_heads);
dim3 blocks_merge(blockx, blocky);
merge_multi_chunks_v2_kernel<NV_TYPE,
vec_size,
blocky,
HEAD_DIM,
OUT_NV_TYPE,
ENABLE_PREFILL>
vec_size,
blocky,
HEAD_DIM,
OUT_NV_TYPE,
ENABLE_PREFILL>
<<<grids_merge, blocks_merge, 0, stream>>>(
reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
static_cast<float *>(tmp_m->ptr()),
@@ -1454,8 +1592,8 @@ void MultiQueryAppendC8Attention(
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
shift_bias ? reinterpret_cast<NV_TYPE *>(
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
const_cast<T *>(shift_bias.get().data<T>()))
: nullptr,
smooth_weight ? reinterpret_cast<NV_TYPE *>(const_cast<T *>(
smooth_weight.get().data<T>()))
: nullptr,
@@ -1517,6 +1655,7 @@ void CascadeAppendAttentionC8Kernel(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out) {
const auto token_num = meta_data.token_nums;
@@ -1525,6 +1664,7 @@ void CascadeAppendAttentionC8Kernel(
const auto num_heads = meta_data.q_num_heads;
const auto group_size = meta_data.q_num_heads / meta_data.kv_num_heads;
const auto head_dim = meta_data.head_dims;
bool is_dynamic_cfp8 = cache_quant_type_str == "block_wise_fp8";
DISPATCH_CAUSAL(
causal,
@@ -1543,43 +1683,46 @@ void CascadeAppendAttentionC8Kernel(
BLOCK_SIZE,
{DISPATCH_BLOCKSHAPE_Q(
block_shape_q, BLOCK_SHAPE_Q, NUM_WARP_Q, {
MultiQueryAppendC8Attention<T,
GROUP_SIZE,
HEAD_DIM,
BLOCK_SIZE,
CAUSAL,
BLOCK_SHAPE_Q,
NUM_WARP_Q,
OutT,
ENABLE_PREFILL, IsFP8>(
meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale.get(),
cache_v_scale.get(),
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
is_decoder,
stream,
out);
})})})})})})
DISPATCH_DyCfp8(is_dynamic_cfp8, IsDynamicC8, {
MultiQueryAppendC8Attention<T,
GROUP_SIZE,
HEAD_DIM,
BLOCK_SIZE,
CAUSAL,
BLOCK_SHAPE_Q,
NUM_WARP_Q,
OutT,
ENABLE_PREFILL,
IsFP8,
IsDynamicC8>(
meta_data,
qkv,
cache_k,
cache_v,
attn_mask,
cache_k_scale.get(),
cache_v_scale.get(),
shift_bias,
smooth_weight,
seq_lens_q,
seq_lens_kv,
seq_lens_encoder,
batch_id_per_token,
cu_seqlens_q,
block_table,
batch_ids,
tile_ids_per_batch,
num_blocks,
max_seq_len,
max_dec_len,
quant_max_bound,
quant_min_bound,
in_scale,
max_partition_size,
encoder_max_partition_size,
speculate_max_draft_token_num,
is_decoder,
stream,
out);
})})})})})})})
}

View File

@@ -384,6 +384,113 @@ __device__ __forceinline__ void produce_v_blockwise_c8(
}
}
template<uint32_t block_size,
uint32_t num_frags_z,
uint32_t NUM_WARP_Q,
typename T>
__device__ __forceinline__ void produce_k_dynamic_scale(
T* k_smem_scale,
T* cache_k_reg,
const int* block_table_now,
const T* cache_k_scale,
const uint32_t kv_idx,
const uint32_t kv_num_heads,
const uint32_t kv_head_idx,
const uint32_t chunk_end
) {
const uint32_t tx = threadIdx.x, ty = threadIdx.y;
if constexpr (NUM_WARP_Q == 4) {
// 4 warps shared block_size
const uint32_t tid = ty * 32 + tx;
int block_id = __ldg(&block_table_now[kv_idx / block_size]);
if (block_id < 0) block_id = 0;
const T* cache_k_scale_now = cache_k_scale + block_id * kv_num_heads * block_size + kv_head_idx * block_size;
if (tid < block_size) {
k_smem_scale[tid] = cache_k_scale_now[tid];
}
__syncthreads();
const uint32_t row_id = tx / 4;
for (uint32_t fz = 0; fz < num_frags_z; fz++) {
cache_k_reg[fz * 2] = k_smem_scale[fz * 16 + row_id];
cache_k_reg[fz * 2 + 1] = k_smem_scale[fz * 16 + row_id + 8];
}
} else {
// 1 warp 32 tokens
const uint32_t kv_idx_now = kv_idx + block_size * ty / 2;
int block_id = __ldg(&block_table_now[kv_idx_now / block_size]);
if (block_id < 0) block_id = 0;
const T* cache_k_scale_now = cache_k_scale + block_id * kv_num_heads * block_size + kv_head_idx * block_size;
const int kv_idx_this_thread = kv_idx + ty * 32 + tx;
if (kv_idx_this_thread < chunk_end) {
k_smem_scale[ty * 32 + tx] = cache_k_scale_now[(ty % 2) * 32 + tx];
} else {
k_smem_scale[ty * 32 + tx] = 0;
}
__syncwarp();
const uint32_t row_id = tx / 4;
for (uint32_t fz = 0; fz < num_frags_z; fz++) {
cache_k_reg[fz * 2] = k_smem_scale[ty * 32 + fz * 16 + row_id];
cache_k_reg[fz * 2 + 1] = k_smem_scale[ty * 32 + fz * 16 + row_id + 8];
}
}
}
template<uint32_t block_size,
uint32_t num_frags_z,
uint32_t NUM_WARP_Q,
typename T>
__device__ __forceinline__ void produce_v_dynamic_scale(
T* v_smem_scale,
T* cache_v_reg,
const int* block_table_now,
const T* cache_v_scale,
const uint32_t kv_idx,
const uint32_t kv_num_heads,
const uint32_t kv_head_idx,
const uint32_t chunk_end
) {
const uint32_t tx = threadIdx.x, ty = threadIdx.y;
if constexpr (NUM_WARP_Q == 4) {
// 4 warps shared block_size
const uint32_t tid = ty * 32 + tx;
int block_id = __ldg(&block_table_now[kv_idx / block_size]);
if (block_id < 0) block_id = 0;
const T* cache_v_scale_now = cache_v_scale + block_id * kv_num_heads * block_size + kv_head_idx * block_size;
if (tid < block_size) {
v_smem_scale[tid] = cache_v_scale_now[tid];
}
__syncthreads();
const uint32_t row_id = tx % 4 * 2;
for (uint32_t fz = 0; fz < num_frags_z; fz++) {
cache_v_reg[fz * 4] = v_smem_scale[fz * 16 + row_id];
cache_v_reg[fz * 4 + 1] = v_smem_scale[fz * 16 + row_id + 1];
cache_v_reg[fz * 4 + 2] = v_smem_scale[fz * 16 + row_id + 8];
cache_v_reg[fz * 4 + 3] = v_smem_scale[fz * 16 + row_id + 9];
}
} else {
// 1 warp 32 tokens
const uint32_t kv_idx_now = kv_idx + block_size * ty / 2;
int block_id = __ldg(&block_table_now[kv_idx_now / block_size]);
if (block_id < 0) block_id = 0;
const T* cache_v_scale_now = cache_v_scale + block_id * kv_num_heads * block_size + kv_head_idx * block_size;
const int kv_idx_this_thread = kv_idx + ty * 32 + tx;
if (kv_idx_this_thread < chunk_end) {
v_smem_scale[ty * 32 + tx] = cache_v_scale_now[(ty % 2) * 32 + tx];
} else {
v_smem_scale[ty * 32 + tx] = 0;
}
__syncwarp();
const uint32_t row_id = tx % 4 * 2;
for (uint32_t fz = 0; fz < num_frags_z; fz++) {
cache_v_reg[fz * 4] = v_smem_scale[ty * 32 + fz * 16 + row_id];
cache_v_reg[fz * 4 + 1] = v_smem_scale[ty * 32 + fz * 16 + row_id + 1];
cache_v_reg[fz * 4 + 2] = v_smem_scale[ty * 32 + fz * 16 + row_id + 8];
cache_v_reg[fz * 4 + 3] = v_smem_scale[ty * 32 + fz * 16 + row_id + 9];
}
}
}
template <SharedMemFillMode fill_mode,
uint32_t num_warps,
uint32_t block_size,
@@ -816,7 +923,8 @@ template <uint32_t num_frags_x,
typename T,
typename CacheT,
bool is_scale_channel_wise = false,
bool IsFP8=false>
bool IsFP8 = false,
bool IsDynamicC8 = false>
__device__ __forceinline__ void compute_qk_c8(smem_t* q_smem,
uint32_t* q_smem_offset_r,
smem_t* k_smem,
@@ -860,20 +968,27 @@ __device__ __forceinline__ void compute_qk_c8(smem_t* q_smem,
convert_c8<T,IsFP8>(b_frag_dq_T, b_frag[fy * 2]);
convert_c8<T,IsFP8>(b_frag_dq_T + 4, b_frag[fy * 2 + 1]);
// scale zp
if constexpr (is_scale_channel_wise) {
const int scale_col = (ky * 2 + fy) * 4;
b_frag_dq_T[0] *= cache_k_scale[scale_col];
b_frag_dq_T[1] *= cache_k_scale[scale_col + 1];
b_frag_dq_T[2] *= cache_k_scale[scale_col + 2];
b_frag_dq_T[3] *= cache_k_scale[scale_col + 3];
b_frag_dq_T[4] *= cache_k_scale[scale_col];
b_frag_dq_T[5] *= cache_k_scale[scale_col + 1];
b_frag_dq_T[6] *= cache_k_scale[scale_col + 2];
b_frag_dq_T[7] *= cache_k_scale[scale_col + 3];
if constexpr (!IsDynamicC8) {
if constexpr (is_scale_channel_wise) {
const int scale_col = (ky * 2 + fy) * 4;
b_frag_dq_T[0] *= cache_k_scale[scale_col];
b_frag_dq_T[1] *= cache_k_scale[scale_col + 1];
b_frag_dq_T[2] *= cache_k_scale[scale_col + 2];
b_frag_dq_T[3] *= cache_k_scale[scale_col + 3];
b_frag_dq_T[4] *= cache_k_scale[scale_col];
b_frag_dq_T[5] *= cache_k_scale[scale_col + 1];
b_frag_dq_T[6] *= cache_k_scale[scale_col + 2];
b_frag_dq_T[7] *= cache_k_scale[scale_col + 3];
} else {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_k_scale[0];
}
}
} else {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_k_scale[0];
b_frag_dq_T[b_i] *= cache_k_scale[fz * 2 + b_i / 4];
}
}
#pragma unroll
@@ -905,12 +1020,15 @@ template <typename T,
uint32_t num_frags_y,
uint32_t num_frags_z,
bool IS_SYSTEM = false>
__device__ __forceinline__ void mask_s(const uint32_t qo_idx_base,
__device__ __forceinline__ void mask_s(const bool* attn_mask,
const uint32_t qo_idx_base,
const uint32_t kv_idx_base,
const uint32_t qo_len,
const uint32_t kv_len,
const uint32_t chunk_end,
float (*s_frag)[num_frags_z][8]) {
const uint32_t attn_mask_len,
float (*s_frag)[num_frags_z][8],
const int *mask_offset = nullptr) {
const uint32_t tx = threadIdx.x;
#pragma unroll
for (uint32_t fx = 0; fx < num_frags_x; ++fx) {
@@ -924,10 +1042,21 @@ __device__ __forceinline__ void mask_s(const uint32_t qo_idx_base,
group_size,
kv_idx = kv_idx_base + fz * 16 + 2 * (tx % 4) +
8 * (reg_id / 4) + reg_id % 2;
const bool out_of_boundary =
(causal
? (kv_idx > kv_len + q_idx - qo_len || (kv_idx >= chunk_end))
: kv_idx >= chunk_end);
bool out_of_boundary;
if (mask_offset) {
out_of_boundary = q_idx < qo_len ? (kv_idx >= mask_offset[q_idx * 2 + 1] || kv_idx < mask_offset[q_idx * 2]) : true;
} else {
out_of_boundary =
(causal
? (kv_idx > kv_len + q_idx - qo_len || (kv_idx >= chunk_end))
: kv_idx >= chunk_end);
if (attn_mask != nullptr && kv_idx > kv_len - qo_len && kv_idx < chunk_end && q_idx < attn_mask_len) {
const int32_t mask_idx = q_idx * attn_mask_len + kv_idx - kv_len + qo_len;
bool mask = attn_mask[mask_idx];
out_of_boundary |= mask;
}
}
if constexpr (std::is_same<T, half>::value) {
s_frag[fx][fz][reg_id] =
out_of_boundary ? -5e4f : s_frag[fx][fz][reg_id];
@@ -935,6 +1064,7 @@ __device__ __forceinline__ void mask_s(const uint32_t qo_idx_base,
s_frag[fx][fz][reg_id] =
out_of_boundary ? -3.0e+30f : s_frag[fx][fz][reg_id];
}
// printf("tid: %d. qk[%u,%u] = %f, mask: %d \n ", threadIdx.x, kv_idx, q_idx, static_cast<float>(s_frag[fx][fz][reg_id]), int(out_of_boundary));
} else {
const uint32_t q_idx = qo_idx_base,
kv_idx = kv_idx_base + fz * 16 + 2 * (tx % 4) +
@@ -1078,7 +1208,9 @@ template <uint32_t num_frags_x,
uint32_t block_size,
typename T,
typename CacheT,
bool is_scale_channel_wise = false, bool IsFP8=false>
bool is_scale_channel_wise = false,
bool IsFP8 = false,
bool IsDynamicC8 = false>
__device__ __forceinline__ void compute_sfm_v_c8(
smem_t* v_smem,
uint32_t* v_smem_offset_r,
@@ -1120,16 +1252,28 @@ __device__ __forceinline__ void compute_sfm_v_c8(
convert_c8<T,IsFP8>(b_frag_dq_T, b_frag[fz * 2]);
convert_c8<T,IsFP8>(b_frag_dq_T + 4, b_frag[fz * 2 + 1]);
// scale zp
if constexpr (is_scale_channel_wise) {
if constexpr (!IsDynamicC8) {
if constexpr (is_scale_channel_wise) {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[b_i / 4 + fy * 2];
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[b_i / 4 + fy * 2];
}
} else {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[0];
}
}
} else {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[0];
}
const int scale_col = (kz * 2 + fz) * 4;
b_frag_dq_T[0] *= cache_v_scale[scale_col];
b_frag_dq_T[1] *= cache_v_scale[scale_col + 1];
b_frag_dq_T[2] *= cache_v_scale[scale_col + 2];
b_frag_dq_T[3] *= cache_v_scale[scale_col + 3];
b_frag_dq_T[4] *= cache_v_scale[scale_col];
b_frag_dq_T[5] *= cache_v_scale[scale_col + 1];
b_frag_dq_T[6] *= cache_v_scale[scale_col + 2];
b_frag_dq_T[7] *= cache_v_scale[scale_col + 3];
}
#pragma unroll
for (uint32_t fx = 0; fx < num_frags_x; ++fx) { // m: num_frags_x * 16
@@ -1156,7 +1300,9 @@ template <uint32_t num_frags_x,
uint32_t block_size,
typename T,
typename CacheT,
bool is_scale_channel_wise = false, bool IsFP8=false>
bool is_scale_channel_wise = false,
bool IsFP8 = false,
bool IsDynamicC8 = false>
__device__ __forceinline__ void compute_sfm_v_c8_iter_sq_bvec(
smem_t* v_smem,
uint32_t* v_smem_offset_r,
@@ -1200,16 +1346,28 @@ __device__ __forceinline__ void compute_sfm_v_c8_iter_sq_bvec(
convert_c8<T,IsFP8>(b_frag_dq_T, b_frag[fz * 2]);
convert_c8<T,IsFP8>(b_frag_dq_T + 4, b_frag[fz * 2 + 1]);
// scale zp
if constexpr (is_scale_channel_wise) {
if constexpr (!IsDynamicC8) {
if constexpr (is_scale_channel_wise) {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[b_i / 4 + fy * 2];
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[b_i / 4 + fy * 2];
}
} else {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[0];
}
}
} else {
#pragma unroll
for (uint32_t b_i = 0; b_i < 8; ++b_i) {
b_frag_dq_T[b_i] *= cache_v_scale[0];
}
const int scale_col = (kz * 2 + fz) * 4;
b_frag_dq_T[0] *= cache_v_scale[scale_col];
b_frag_dq_T[1] *= cache_v_scale[scale_col + 1];
b_frag_dq_T[2] *= cache_v_scale[scale_col + 2];
b_frag_dq_T[3] *= cache_v_scale[scale_col + 3];
b_frag_dq_T[4] *= cache_v_scale[scale_col];
b_frag_dq_T[5] *= cache_v_scale[scale_col + 1];
b_frag_dq_T[6] *= cache_v_scale[scale_col + 2];
b_frag_dq_T[7] *= cache_v_scale[scale_col + 3];
}
#pragma unroll
for (uint32_t fx = 0; fx < num_frags_x; ++fx) { // m: num_frags_x * 16

View File

@@ -103,6 +103,7 @@ void CascadeAppendAttentionC8Kernel(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -264,9 +265,10 @@ void CascadeAppendAttentionKernel(
causal,
is_decoder,
enable_prefill,
cache_quant_type_str,
stream,
out);
} else if (cache_quant_type_str == "cache_fp8") {
} else if (cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
CascadeAppendAttentionC8Kernel<T, OutT, true>(meta_data,
qkv,
cache_k,
@@ -299,6 +301,7 @@ void CascadeAppendAttentionKernel(
causal,
is_decoder,
enable_prefill,
cache_quant_type_str,
stream,
out);
} else if (cache_quant_type_str == "cache_int4_zp") {

View File

@@ -16,12 +16,11 @@
#include "utils.cuh"
template <typename T, typename QKV_TYPE>
void append_decode_cache_rope(const QKV_TYPE* qkv,
void append_decode_cache_rope_qk_norm(const QKV_TYPE* qkv,
T* key_cache,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
@@ -38,6 +37,68 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
const int bsz,
const cudaStream_t& stream,
const bool use_neox_style,
const bool rope_3d,
const float* q_norm_weight,
const float* k_norm_weight,
const float rms_norm_eps) {
const uint32_t elem_nums =
use_neox_style ? bsz * (num_heads + 2 * kv_num_heads) * dim_head / 2
: bsz * (num_heads + 2 * kv_num_heads) * dim_head;
constexpr int HEAD_DIM = 128;
constexpr int PackSize = HEAD_DIM / kWarpSize;
const int pack_num = elem_nums / PackSize;
const int blocksize = 128;
int grid_size = 1;
GetNumBlocks<128>(pack_num, &grid_size);
dim3 block_dim(kWarpSize, blocksize / kWarpSize, 1);
append_decode_cache_T_rope_qk_norm_kernel<T, PackSize>
<<<grid_size, block_dim, 0, stream>>>(reinterpret_cast<const T*>(qkv),
key_cache,
value_cache,
qkv_out,
block_tables,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
sin_emb,
max_seq_len,
max_blocks_per_seq,
num_heads,
dim_head,
block_size,
elem_nums,
kv_num_heads,
rope_3d,
q_norm_weight,
k_norm_weight,
rms_norm_eps);
}
template <typename T, typename QKV_TYPE>
void append_decode_cache_rope(const QKV_TYPE* qkv,
T* key_cache,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
const float* sin_emb,
const float* qkv_out_scales,
const T* qkv_biases,
const int max_seq_len,
const int max_blocks_per_seq,
const int num_heads,
const int kv_num_heads,
const int dim_head,
const int rotary_dim,
const int block_size,
const int bsz,
const cudaStream_t& stream,
const bool use_neox_style,
const bool rope_3d) {
const uint32_t elem_nums =
use_neox_style ? bsz * (num_heads + 2 * kv_num_heads) * dim_head / 2
@@ -57,7 +118,6 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -71,15 +131,37 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
dim_head,
block_size,
elem_nums,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_decode_cache_T_neox_rope_kernel<T, PackSize>
if (rotary_dim < dim_head){
append_decode_cache_T_neox_partial_rope_kernel<T, PackSize>
<<<grid_size, blocksize, 0, stream>>>(reinterpret_cast<const T*>(qkv),
key_cache,
value_cache,
qkv_out,
block_tables,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
cos_emb,
sin_emb,
max_seq_len,
max_blocks_per_seq,
num_heads,
dim_head,
rotary_dim,
block_size,
elem_nums,
kv_num_heads,
rope_3d);
}else{
append_decode_cache_T_neox_rope_kernel<T, PackSize>
<<<grid_size, blocksize, 0, stream>>>(reinterpret_cast<const T*>(qkv),
key_cache,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -91,7 +173,9 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
dim_head,
block_size,
elem_nums,
kv_num_heads);
kv_num_heads,
rope_3d);
}
}
} else {
if (qkv_out_scales) {
@@ -102,7 +186,6 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -125,7 +208,6 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -149,7 +231,6 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
@@ -182,7 +263,6 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -198,7 +278,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
block_size,
127.0f,
-127.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_decode_cache_int8_neox_rope_kernel<T, 4>
<<<grids, num_warps * 32, 0, stream>>>(
@@ -207,7 +288,6 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -221,7 +301,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
block_size,
127.0f,
-127.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
}
} else {
if (qkv_out_scales) {
@@ -232,7 +313,6 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -248,7 +328,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
block_size,
127.0f,
-127.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_decode_cache_int8_rope_kernel<T, 4, 0, 128, is_scale_channel_wise, IsFP8>
<<<grids, num_warps * 32, 0, stream>>>(
@@ -257,7 +338,6 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -271,7 +351,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
block_size,
127.0f,
-127.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
}
}
}
@@ -282,7 +363,6 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
uint8_t* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
@@ -317,7 +397,6 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -335,7 +414,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
block_size,
7.0f,
-8.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_decode_cache_int4_neox_rope_kernel<T, 4>
<<<grids, num_warps * 32, 0, stream>>>(
@@ -344,7 +424,6 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -360,7 +439,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
block_size,
7.0f,
-8.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
}
} else {
if (qkv_out_scales) {
@@ -371,7 +451,6 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -389,7 +468,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
block_size,
7.0f,
-8.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_decode_cache_int4_rope_kernel<T, 4>
<<<grids, num_warps * 32, 0, stream>>>(
@@ -398,7 +478,6 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
seq_lens_encoder,
@@ -414,7 +493,8 @@ void append_decode_cache_int4_rope(const QKV_TYPE* qkv,
block_size,
7.0f,
-8.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
}
}
}
@@ -424,7 +504,6 @@ void DecoderWriteCacheWithRoPEKernel(
const paddle::Tensor& qkv,
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
@@ -441,7 +520,10 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out) {
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps) {
typedef cascade_attn_type_traits<T> traits_;
typedef cascade_attn_type_traits<QKV_TYPE> qkt_nv_type_;
typedef typename traits_::type DataType_;
@@ -458,119 +540,126 @@ void DecoderWriteCacheWithRoPEKernel(
const float* cos_emb =
rotary_embs ? rotary_embs.get().data<float>() : nullptr;
const float* sin_emb;
int rotary_dim = dim_head;
if (rotary_embs) {
sin_emb =
use_neox_rotary_style
? rotary_embs.get().data<float>() + max_seq_len * dim_head
: rotary_embs.get().data<float>() + max_seq_len * dim_head / 2;
rotary_dim = rotary_embs.get().dims()[rotary_embs.get().dims().size()-1] * 2;
if(rotary_dim < dim_head){
if (!use_neox_rotary_style || qkv_out_scales || q_norm_weight || k_norm_weight|| cache_quant_type_str != "none"){
PADDLE_THROW(phi::errors::Fatal(
"partial_rotary_factor < 1.0 only supports neox_rotary_style=True, qkv_out_scales is None, q_norm_weight/k_norm_weight) is None, and cache_quant_type_str is 'none'."));
}
sin_emb = rotary_embs.get().data<float>() + max_seq_len * rotary_dim / 2;
}
}
if (cache_quant_type_str == "none") {
append_decode_cache_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "cache_int8") {
bool is_scale_channel_wise = false;
if (cache_k_scale && cache_k_scale.get().dims()[0] == dim_head * kv_num_heads) {
is_scale_channel_wise = true;
}
if (is_scale_channel_wise) {
append_decode_cache_int8_rope<DataType_, QKV_TYPE, true>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
if (q_norm_weight && k_norm_weight) {
if (cache_quant_type_str == "none") {
append_decode_cache_rope_qk_norm(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d,
q_norm_weight ? q_norm_weight.get().data<float>() : nullptr,
k_norm_weight ? k_norm_weight.get().data<float>() : nullptr,
rms_norm_eps);
} else if (cache_quant_type_str == "block_wise_fp8") {
constexpr int num_warps = 4;
const int all_warps =
((num_heads + 2 * kv_num_heads) + num_warps - 1) / num_warps * num_warps;
dim3 grids(bsz, all_warps / num_warps);
append_decode_cache_int8_rope_qk_norm_kernel<DataType_, 4, 0, 128, false, true>
<<<grids, num_warps * 32, 0, stream>>>(
reinterpret_cast<const DataType_*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
const_cast<DataType_*>(reinterpret_cast<const DataType_*>(cache_k_scale.get().data<T>())),
const_cast<DataType_*>(reinterpret_cast<const DataType_*>((cache_v_scale.get().data<T>()))),
q_norm_weight.get().data<float>(),
k_norm_weight.get().data<float>(),
max_seq_len,
max_blocks_per_seq,
num_heads,
block_size,
127.0f,
-127.0f,
kv_num_heads,
rope_3d,
rms_norm_eps);
} else {
append_decode_cache_int8_rope<DataType_, QKV_TYPE, false>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
PD_THROW(
"append_decode_cache_rope_qk_norm just supports cache_quant_type none/block_wise_fp8");
}
} else if (cache_quant_type_str == "cache_fp8") {
append_decode_cache_int8_rope<DataType_, QKV_TYPE, false, true>(
} else {
if (cache_quant_type_str == "none") {
append_decode_cache_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
rotary_dim,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "cache_int8") {
bool is_scale_channel_wise = false;
if (cache_k_scale && cache_k_scale.get().dims()[0] == dim_head * kv_num_heads) {
is_scale_channel_wise = true;
}
if (is_scale_channel_wise) {
append_decode_cache_int8_rope<DataType_, QKV_TYPE, true>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
@@ -596,49 +685,144 @@ void DecoderWriteCacheWithRoPEKernel(
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "cache_int4_zp") {
append_decode_cache_int4_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
} else {
append_decode_cache_int8_rope<DataType_, QKV_TYPE, false>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
}
} else if (cache_quant_type_str == "cache_fp8") {
append_decode_cache_int8_rope<DataType_, QKV_TYPE, false, true>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_zp.get().data<T>()))
: nullptr,
cache_v_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_zp.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, cache_fp8 "
"cache_int4_zp]");
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "block_wise_fp8") {
constexpr int num_warps = 4;
const int all_warps =
((num_heads + 2 * kv_num_heads) + num_warps - 1) / num_warps * num_warps;
dim3 grids(bsz, all_warps / num_warps);
append_decode_cache_int8_rope_qk_norm_kernel<DataType_, 4, 0, 128, false, true>
<<<grids, num_warps * 32, 0, stream>>>(
reinterpret_cast<const DataType_*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
const_cast<DataType_*>(reinterpret_cast<const DataType_*>(cache_k_scale.get().data<T>())),
const_cast<DataType_*>(reinterpret_cast<const DataType_*>((cache_v_scale.get().data<T>()))),
nullptr,
nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
block_size,
127.0f,
-127.0f,
kv_num_heads,
rope_3d,
rms_norm_eps);
} else if (cache_quant_type_str == "cache_int4_zp") {
append_decode_cache_int4_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
cache_k_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_zp.get().data<T>()))
: nullptr,
cache_v_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_zp.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
stream,
use_neox_rotary_style,
rope_3d);
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, cache_fp8 "
"cache_int4_zp]");
}
}
}
@@ -650,7 +834,6 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
@@ -667,7 +850,10 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void
DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
@@ -677,7 +863,6 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
@@ -694,7 +879,10 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
const AppendAttnMetaData& meta_data,
@@ -703,7 +891,6 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
@@ -720,7 +907,10 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
const AppendAttnMetaData& meta_data,
@@ -729,7 +919,6 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
@@ -746,4 +935,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -23,7 +23,6 @@ void DecoderWriteCacheWithRoPEKernel(
// kv_num_heads, head_dim] if GQA)
const paddle::Tensor& seq_lens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_tables,
const paddle::optional<paddle::Tensor>& rotary_embs,
@@ -40,4 +39,6 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight, const float rms_norm_eps);

View File

@@ -46,38 +46,32 @@ void EncoderWriteCacheWithRopeKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out) {
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps) {
auto token_num = meta_data.token_nums;
auto num_heads = meta_data.q_num_heads;
auto kv_num_heads = meta_data.kv_num_heads;
auto head_dim = meta_data.head_dims;
bool is_scale_channel_wise = false;
int rotary_dim = head_dim;
if (cache_k_scale && cache_k_scale.get().dims()[0] == head_dim * kv_num_heads) {
is_scale_channel_wise = true;
}
if (rotary_embs){
rotary_dim = rotary_embs.get().dims()[rotary_embs.get().dims().size()-1] * 2;
if(rotary_dim < head_dim){
if (!use_neox_style || q_norm_weight || k_norm_weight || num_heads == kv_num_heads || is_scale_channel_wise){
PADDLE_THROW(phi::errors::Fatal(
"partial_rotary_factor < 1.0 only supports use_neox_rotary_style=True, q_norm_weight/k_norm_weight) is None, GQA and is_scale_channel_wise=false."));
}
}
}
if (num_heads == kv_num_heads) {
rotary_qk_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
num_heads,
max_seq_len,
rotary_embs.get().dims()[2],
head_dim,
stream,
use_neox_style,
rope_3d);
} else {
if (!is_scale_channel_wise) {
gqa_rotary_qk_variable(
if (q_norm_weight && k_norm_weight) {
if (num_heads != kv_num_heads && !is_scale_channel_wise && !use_neox_style) {
gqa_rotary_qk_norm_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
@@ -95,31 +89,81 @@ void EncoderWriteCacheWithRopeKernel(
head_dim,
stream,
use_neox_style,
rope_3d);
rope_3d,
q_norm_weight ? q_norm_weight.get().data<float>() : nullptr,
k_norm_weight ? k_norm_weight.get().data<float>() : nullptr,
rms_norm_eps);
} else {
gqa_rotary_qk_quant_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
cache_k_scale ? cache_k_scale.get().data<T>() : nullptr,
cache_v_scale ? cache_v_scale.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
num_heads,
kv_num_heads,
max_seq_len,
rotary_embs.get().dims()[2],
head_dim,
stream,
use_neox_style,
rope_3d);
PD_THROW(
"gqa_rotary_qk_norm_variable only support gqa mode. channel wise scale and neox style are not supported");
}
} else {
if (num_heads == kv_num_heads) {
rotary_qk_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
num_heads,
max_seq_len,
rotary_embs.get().dims()[2],
head_dim,
stream,
use_neox_style,
rope_3d);
} else {
if (!is_scale_channel_wise) {
gqa_rotary_qk_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
num_heads,
kv_num_heads,
max_seq_len,
rope_3d ? rotary_embs.get().dims()[3] : rotary_embs.get().dims()[2],
head_dim,
rotary_dim,
stream,
use_neox_style,
rope_3d);
} else {
gqa_rotary_qk_quant_variable(
qkv_out->data<T>(),
qkv.data<QKV_TYPE>(),
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? qkv_biases.get().data<T>() : nullptr,
cache_k_scale ? cache_k_scale.get().data<T>() : nullptr,
cache_v_scale ? cache_v_scale.get().data<T>() : nullptr,
rotary_embs.get().data<float>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
token_num,
num_heads,
kv_num_heads,
max_seq_len,
rotary_embs.get().dims()[2],
head_dim,
stream,
use_neox_style,
rope_3d);
}
}
}
const uint32_t block_size = meta_data.block_size;
if (cache_quant_type_str == "none") {
@@ -134,7 +178,7 @@ void EncoderWriteCacheWithRopeKernel(
stream,
key_cache_out,
value_cache_out);
} else if (cache_quant_type_str == "cache_int8" or cache_quant_type_str == "cache_fp8") {
} else if (cache_quant_type_str == "cache_int8" or cache_quant_type_str == "cache_fp8" or cache_quant_type_str == "block_wise_fp8") {
DISPATCH_HEAD_DIM(
head_dim, HEAD_DIM, {DISPATCH_BLOCK_SIZE(block_size, BLOCK_SIZE, {
CascadeAppendWriteCacheKVC8QKV<T, HEAD_DIM, BLOCK_SIZE>(
@@ -154,7 +198,7 @@ void EncoderWriteCacheWithRopeKernel(
num_blocks,
max_seq_len,
is_scale_channel_wise,
cache_quant_type_str == "cache_fp8",
cache_quant_type_str,
stream,
key_cache_out,
value_cache_out);

View File

@@ -11,10 +11,11 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cute/tensor.hpp"
#include "helper.h"
#include "paddle/extension.h"
#include "paddle/phi/core/memory/memcpy.h"
#include "utils.cuh"
template <int THREADBLOCK_SIZE>
__global__ void
@@ -116,6 +117,93 @@ void GetMaxLen(const paddle::Tensor &seq_lens_tensor,
max_len_tensor.data<int>(), batch_size);
}
template <uint32_t config_size>
__global__ void search_chunk_size_for_mla(
const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_encoder,
const int *__restrict__ seq_lens_decoder,
int *__restrict__ num_blocks_x,
int *__restrict__ res_chunk_size,
const int bsz,
const int set_chunk_size,
const int block_size,
const int sm_cout) {
const uint32_t conf_id = threadIdx.x;
int gridx = 0;
if (set_chunk_size > 0 && conf_id == 0) {
for (uint32_t bid = 0; bid < bsz; bid++) {
int seq_len = seq_lens_q[bid];
int seq_len_encoder = seq_lens_encoder[bid];
int seq_len_decoder = seq_lens_decoder[bid] + seq_len;
if (seq_len == 0 || seq_len_encoder > 0) continue;
int loop_times;
loop_times = cute::ceil_div(seq_len_decoder, set_chunk_size);
gridx += loop_times;
}
*num_blocks_x = gridx;
*res_chunk_size = set_chunk_size;
} else if (conf_id < config_size) {
__shared__ int gridx_shared[config_size];
// chunk_size is a multiple of 64
const int chunk_size = block_size << conf_id;
for (uint32_t bid = 0; bid < bsz; bid++) {
int seq_len = seq_lens_q[bid];
int seq_len_encoder = seq_lens_encoder[bid];
int seq_len_decoder = seq_lens_decoder[bid] + seq_len;
if (seq_len == 0 || seq_len_encoder > 0) continue;
int loop_times;
loop_times = cute::ceil_div(seq_len_decoder, chunk_size);
gridx += loop_times;
}
gridx_shared[conf_id] = gridx;
__syncthreads();
if (threadIdx.x == 0) {
uint32_t res_id = 0;
uint32_t max_last_wave_block = 0;
for (uint32_t i = 1; i < config_size; i++) {
uint32_t last_wave_block = gridx_shared[i] % sm_cout;
if (last_wave_block >= max_last_wave_block) {
res_id = i;
max_last_wave_block = last_wave_block;
}
}
*num_blocks_x = gridx_shared[res_id];
*res_chunk_size = block_size << res_id;
}
}
}
__global__ void split_block_for_mla(const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_encoder,
const int *__restrict__ seq_lens_decoder,
int *__restrict__ batch_ids,
int *__restrict__ tile_ids_per_batch,
const int bsz,
const int chunk_size) {
if (threadIdx.x == 0) {
int index = 0;
for (uint32_t bid = 0; bid < bsz; bid++) {
int seq_len = seq_lens_q[bid];
int seq_len_encoder = seq_lens_encoder[bid];
int seq_len_decoder = seq_lens_decoder[bid] + seq_len;
if (seq_len == 0) continue;
int loop_times;
loop_times = cute::ceil_div(seq_len_decoder, chunk_size);
if (seq_len_encoder > 0) {
loop_times = 0;
}
for (uint32_t tile_id = 0; tile_id < loop_times; tile_id++) {
batch_ids[index] = bid;
tile_ids_per_batch[index++] = tile_id;
}
}
}
}
__global__ void split_q_block(const int *__restrict__ seq_lens_q,
const int *__restrict__ seq_lens_encoder,
int *__restrict__ batch_ids,
@@ -191,14 +279,23 @@ get_max_len_kv_ernel(int *max_seq_lens_out, const int *seq_lens_this_time,
}
}
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
void GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
paddle::Tensor &decoder_batch_ids, // Inplace
paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
paddle::Tensor &decoder_num_blocks_cpu, // Inplace, Pinned Memory
paddle::Tensor &decoder_num_blocks_device, // Inplace
paddle::Tensor &decoder_chunk_size_device, // Inplace
paddle::Tensor &max_len_tensor_cpu, // Inplace, CPU
paddle::Tensor &encoder_batch_ids, // Inplace
paddle::Tensor &encoder_tile_ids_per_batch, // Inplace
paddle::Tensor &encoder_num_blocks_x_cpu, // Inplace, CPU
paddle::Tensor &kv_batch_ids, // Inplace
paddle::Tensor &kv_tile_ids_per_batch, // Inplace
paddle::Tensor &kv_num_blocks_x_cpu, // Inplace, CPU
paddle::Tensor &max_len_kv_cpu, // Inplace, CPU
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
@@ -223,31 +320,120 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
int max_system_len = max_len_cpu_ptr[6];
int max_just_dec_len_without_system = max_len_cpu_ptr[7];
paddle::Tensor encoder_batch_ids;
paddle::Tensor encoder_tile_ids_per_batch;
paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
paddle::Tensor kv_batch_ids;
paddle::Tensor kv_tile_ids_per_batch;
paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
paddle::Tensor max_len_kv_cpu; /*cpu*/
auto max_len_kv =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
get_max_len_kv_ernel<128><<<1, 128, 0, stream>>>(
max_len_kv.data<int>(), seq_lens_this_time.data<int>(),
seq_lens_decoder.data<int>(), bsz);
max_len_kv_cpu = max_len_kv.copy_to(paddle::CPUPlace(), false);
max_len_kv_cpu.copy_(max_len_kv, max_len_kv_cpu.place(), false);
// decoder
if (max_dec_len_this_time > 0) {
const bool mla_use_tensorcore = GetMlaUseTensorcore();
if (mla_use_tensorcore && group_size <= 64) {
const int set_chunk_size = get_mla_dec_chunk_size(bsz);
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));
int device;
cudaGetDevice(&device);
int sm_cout;
cudaDeviceGetAttribute(&sm_cout, cudaDevAttrMultiProcessorCount, device);
constexpr int config_size =
12; // search space for chunk size:[64, 128, 256, ... 131072]
search_chunk_size_for_mla<config_size>
<<<1, 32, 0, stream>>>(seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
decoder_num_blocks_device.data<int>(),
decoder_chunk_size_device.data<int>(),
bsz,
set_chunk_size,
block_size,
sm_cout);
decoder_num_blocks_cpu.copy_(
decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
auto decoder_chunk_size_cpu =
decoder_chunk_size_device.copy_to(paddle::CPUPlace(), false);
const int chunk_size = decoder_chunk_size_cpu.data<int>()[0];
// NOTE: (changwenbin) When using auto_chunk,
// decode_max_tile_size must take into account the maximum case, where * 1024 can cover 128K.
// const uint32_t decoder_batch_shape = seq_lens_decoder.dims()[0] * 1024;
const uint32_t decoder_max_tile_size_per_bs_q =
div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
const uint32_t decoder_batch_shape =
bsz * 1024 * decoder_max_tile_size_per_bs_q;
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(decoder_batch_ids.data<int>(),
0,
decoder_batch_shape * sizeof(int32_t),
stream));
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
0,
decoder_batch_shape * sizeof(int32_t),
stream));
split_block_for_mla<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
seq_lens_decoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
bsz,
chunk_size);
} else {
// Note:(changwenbin)In order to adapt to cudagraph, the maximum value should be taken here
const uint32_t decoder_max_tile_size_per_bs_q = div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
const uint32_t decoder_batch_shape = bsz * 1024 * decoder_max_tile_size_per_bs_q;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_batch_ids.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));
split_q_block<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_device.data<int>(),
bsz,
decoder_block_shape_q,
group_size);
decoder_num_blocks_cpu.copy_(
decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
}
} else {
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));
decoder_num_blocks_cpu.copy_(
decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
}
// encoder
if (max_enc_len_this_time > 0) {
const uint32_t max_tile_size_per_bs_kv =
div_up(max_enc_dec_len_this_time, block_size);
kv_batch_ids =
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
seq_lens_encoder.place());
kv_tile_ids_per_batch =
GetEmptyTensor({bsz * max_tile_size_per_bs_kv}, paddle::DataType::INT32,
seq_lens_encoder.place());
const uint32_t max_tile_size_per_bs_kv = div_up(max_enc_dec_len_this_time, block_size);
const uint32_t kv_batch_shape = bsz * max_tile_size_per_bs_kv;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(kv_batch_ids.data<int>(), 0, kv_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(kv_tile_ids_per_batch.data<int>(), 0, kv_batch_shape * sizeof(int32_t), stream));
auto kv_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
@@ -258,16 +444,12 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
kv_tile_ids_per_batch.data<int>(), kv_num_blocks_x.data<int>(), bsz,
block_size, block_size);
kv_num_blocks_x_cpu = kv_num_blocks_x.copy_to(paddle::CPUPlace(), false);
const uint32_t encoder_max_tile_size_per_bs_q =
div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
encoder_batch_ids =
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
paddle::DataType::INT32, seq_lens_encoder.place());
encoder_tile_ids_per_batch =
GetEmptyTensor({bsz * encoder_max_tile_size_per_bs_q},
paddle::DataType::INT32, seq_lens_encoder.place());
kv_num_blocks_x_cpu.copy_(kv_num_blocks_x, kv_num_blocks_x_cpu.place(), false);
// Clear buffer
const uint32_t encoder_max_tile_size_per_bs_q = div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
const uint32_t encoder_batch_shape = bsz * encoder_max_tile_size_per_bs_q;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(encoder_batch_ids.data<int>(), 0, encoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(encoder_tile_ids_per_batch.data<int>(), 0, encoder_batch_shape * sizeof(int32_t), stream));
auto encoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(), nullptr,
@@ -275,54 +457,9 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
encoder_tile_ids_per_batch.data<int>(),
encoder_num_blocks_x.data<int>(), bsz,
encoder_block_shape_q, group_size);
encoder_num_blocks_x_cpu =
encoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
} else {
encoder_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
kv_batch_ids =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
kv_tile_ids_per_batch =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
kv_num_blocks_x_cpu =
GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
encoder_num_blocks_x_cpu.copy_(encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), false);
}
if (max_just_dec_len_this_time > 0) {
// Clear buffer
const uint32_t decoder_max_tile_size_per_bs_q = div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
const uint32_t decoder_batch_shape = bsz * decoder_max_tile_size_per_bs_q;
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_batch_ids.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_num_blocks_x_cpu.data<int>(), 0, sizeof(int32_t), stream));
auto decoder_num_blocks_x =
GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
split_q_block<<<1, 32, 0, stream>>>(
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
decoder_batch_ids.data<int>(),
decoder_tile_ids_per_batch.data<int>(),
decoder_num_blocks_x.data<int>(),
bsz,
decoder_block_shape_q,
group_size);
decoder_num_blocks_x_cpu.copy_(decoder_num_blocks_x, decoder_num_blocks_x_cpu.place(), false);
}
return {
encoder_batch_ids,
encoder_tile_ids_per_batch,
encoder_num_blocks_x_cpu, /*cpu*/
kv_batch_ids,
kv_tile_ids_per_batch,
kv_num_blocks_x_cpu, /*cpu*/
max_len_kv_cpu, /*cpu*/
};
}
PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
@@ -332,17 +469,20 @@ PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
"seq_lens_this_time",
"decoder_batch_ids",
"decoder_tile_ids_per_batch",
"decoder_num_blocks_x_cpu",
"max_len_tensor_cpu"
"decoder_num_blocks_cpu",
"decoder_num_blocks_device",
"decoder_chunk_size_device",
"max_len_tensor_cpu",
"encoder_batch_ids",
"encoder_tile_ids_per_batch",
"encoder_num_blocks_x_cpu",
"kv_batch_ids",
"kv_tile_ids_per_batch",
"kv_num_blocks_x_cpu",
"max_len_kv_cpu"
})
.Outputs({
paddle::Optional("encoder_batch_ids"),
paddle::Optional("encoder_tile_ids_per_batch"),
paddle::Optional("encoder_num_blocks_x_cpu"),
paddle::Optional("kv_batch_ids"),
paddle::Optional("kv_tile_ids_per_batch"),
paddle::Optional("kv_num_blocks_x_cpu"),
"max_len_kv_cpu"
})
.Attrs({
"encoder_block_shape_q: int",

View File

@@ -37,7 +37,8 @@ __global__ void GQAVariableLengthRotarySplitKernel(
const int q_num_head,
const int kv_num_head,
const int seq_len,
const int last_dim) {
const int last_dim,
const bool rope_3d) {
using LoadT = AlignedVector<T, VecSize>;
constexpr int HalfVecSize = VecSize / 2;
using LoadEmbT = AlignedVector<float, HalfVecSize>;
@@ -62,6 +63,7 @@ __global__ void GQAVariableLengthRotarySplitKernel(
const int kv_write_idx = cu_seqlens_k[ori_bi] + ori_seq_id;
const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
int64_t new_emb_idx = rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
const int64_t base_idx =
token_idx * (q_num_head + 2 * kv_num_head) * last_dim + hi * last_dim +
h_bias;
@@ -80,8 +82,8 @@ __global__ void GQAVariableLengthRotarySplitKernel(
Load<T, VecSize>(&qkv[base_idx], &src_vec);
// do rope
if (hi < q_num_head + kv_num_head) {
Load<float, HalfVecSize>(&cos_emb[emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[emb_idx], &sin_emb_vec);
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
const float input_left = static_cast<float>(src_vec[2 * i]);
@@ -118,6 +120,7 @@ void gqa_rotary_qk_split_variable(
const int seq_len,
const int input_output_len,
const int dim_head,
const bool rope_3d,
const cudaStream_t &stream) {
int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * dim_head;
constexpr int PackSize = 16 / sizeof(T);
@@ -146,7 +149,8 @@ void gqa_rotary_qk_split_variable(
num_heads,
kv_num_heads,
seq_len,
dim_head);
dim_head,
rope_3d);
}
template <typename T,
@@ -213,7 +217,7 @@ __global__ void append_cache_kv_c16(
// load k_smem 64 rows 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 noce, need 2 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
@@ -231,7 +235,7 @@ __global__ void append_cache_kv_c16(
// deal k_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 noce, need 8 iter
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
uint32_t col_idx = fy * 16 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, kv_frag);
// layout
@@ -274,7 +278,7 @@ __global__ void append_cache_kv_c16(
// load v_smem 64 rows 128 cols
for (int fz = 0; fz < 4; fz++) { // // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 noce, need 2 iter
for (int fy = 0; fy < 2; fy++) { // 8 * 128b = 64 * bf16 once, need 2 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
@@ -292,7 +296,7 @@ __global__ void append_cache_kv_c16(
// deal v_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 noce, need 8 iter
for (int fy = 0; fy < 8; fy++) { // 2 * 128b = 16 * bf16 once, need 8 iter
uint32_t col_idx = fy * 16 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, kv_frag);
// layout
@@ -396,7 +400,7 @@ __global__ void append_cache_kv_c8(
// load v_smem 64 rows, 128 cols
for (int fz = 0; fz < 4; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 noce, need 1 iter
for (int fy = 0; fy < 1; fy++) { // 8 * 128b = 128 * uint8 once, need 1 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
@@ -414,7 +418,7 @@ __global__ void append_cache_kv_c8(
// deal k_smem 64 rows, 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 noce, need 4 iter
for (int fy = 0; fy < 4; fy++) { // 2 * 128b = 32 * uint8 once, need 4 iter
uint32_t col_idx = fy * 32 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
// layout
@@ -462,7 +466,7 @@ __global__ void append_cache_kv_c8(
tid % 4 * num_elems_per_128b<CacheT>();
// load v_smem 128 rows 64 cols
for (int fy = 0; fy < 4; fy++) { // 8 rows pre warp once, 32 rows all 4 warps once, need 4 iter
for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 noce, need 1 iter
for (int fz = 0; fz < 1; fz++) { // 4 * 128b = 64 * uint8 once, need 1 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
@@ -481,7 +485,7 @@ __global__ void append_cache_kv_c8(
// deal v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 noce, need 2 iter
for (int fz = 0; fz < 2; fz++) { // 2 * 128b = 32 * uint8 once, need 2 iter
uint32_t kv_idx = fz * 32 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
// layout
@@ -610,7 +614,7 @@ __global__ void append_cache_kv_c4(
// load k_smem 64 rows 128 cols
for (int fz = 0; fz < 2; fz++) { // 4 rows pre warp once, 16 rows all 4 warps once, need 4 iter
for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 noce, need 1 iter
for (int fy = 0; fy < 1; fy++) { // 4 * 128b = 128 * int4 once, need 1 iter
k_smem.load_128b_async<SharedMemFillMode::kNoFill>(
k_smem_offset_w, cur_cache_k + k_read_idx, end_idx > 0);
k_smem_offset_w =
@@ -628,7 +632,7 @@ __global__ void append_cache_kv_c4(
// deal k_smem 64 rows 128 cols
for (int fz = 0; fz < 1; fz++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 1 iter
uint32_t row_idx = wid * 16 + tid / 4;
for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 noce, need 2 iter
for (int fy = 0; fy < 2; fy++) { // 2 * 128b = 64 * int4 once, need 2 iter
uint32_t col_idx = fy * 64 + tid % 4 * 2;
k_smem.ldmatrix_m8n8x4(k_smem_offset_r, k_frag);
@@ -681,7 +685,7 @@ __global__ void append_cache_kv_c4(
tid % 2 * num_elems_per_128b<CacheT>();
// load v_smem 128 rows 64 rows
for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 noce, need 1 iter
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
v_smem.load_128b_async<SharedMemFillMode::kNoFill>(
v_smem_offset_w, cur_cache_v + v_read_idx, end_idx > 0);
v_smem_offset_w =
@@ -700,7 +704,7 @@ __global__ void append_cache_kv_c4(
// deal v_smem 128 rows 64 cols
for (int fy = 0; fy < 2; fy++) { // 16 rows pre warp once, 64 rows all 4 warps once, need 2 iter
uint32_t dim_idx = fy * NUM_WARPS * 16 + wid * 16 + tid / 4;
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 noce, need 1 iter
for (int fz = 0; fz < 1; fz++) { // 2 * 128b = 64 * int4 once, need 1 iter
uint32_t kv_idx = fz * 64 + tid % 4 * 2;
v_smem.ldmatrix_m8n8x4(v_smem_offset_r, v_frag);
// layout
@@ -890,7 +894,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::optional<paddle::Tensor>& kv_signal_data,
const int kv_token_num,
const int max_seq_len,
const std::string& cache_quant_type) {
const std::string& cache_quant_type,
const bool rope_3d) {
typedef PDTraits<paddle::DataType::BFLOAT16> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
@@ -953,8 +958,9 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
num_heads,
kv_num_heads,
max_seq_len,
rotary_embs.dims()[2],
rope_3d ? rotary_embs.dims()[3] : rotary_embs.dims()[2],
head_dim,
rope_3d,
stream);
if (token_num < kv_token_num) {
@@ -994,7 +1000,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
stream,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));
} else if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
} else if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8" || cache_quant_type == "block_wise_fp8") {
CascadeAppendWriteCacheKVC8QKV<data_t, 128, 64>(
meta_data,
*const_cast<paddle::Tensor*>(&key_cache),
@@ -1012,7 +1018,7 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
kv_num_blocks_data,
max_seq_len,
false, // is_scale_channel_wise
cache_quant_type == "cache_fp8", // is_fp8
cache_quant_type,
stream,
const_cast<paddle::Tensor*>(&key_cache),
const_cast<paddle::Tensor*>(&value_cache));

View File

@@ -18,6 +18,168 @@
#include "mma_tensor_op.cuh"
#include "utils.cuh"
template <typename T, int VecSize = 1, typename InT = T>
__global__ void append_speculate_cache_T_rope_qk_norm_kernel(
const InT* __restrict__ qkv, // [token_num, num_heads + 2 * gqa_group_size,
// head_size]
T* __restrict__ key_cache, // [num_blocks, gqa_group_size, block_size,
// head_size // 2]
T* __restrict__ value_cache, // [num_blocks, gqa_group_size, block_size,
// head_size // 2]
T* __restrict__ q_out,
const int* __restrict__ block_tables, // [bsz, max_blocks_per_seq]
const int* __restrict__ batch_id_per_token, // [num_tokens]
const int* __restrict__ cu_seqlens_q,
const int* __restrict__ seq_lens_decoder, // [bsz]
const float* __restrict__ cos_emb,
const float* __restrict__ sin_emb,
const float*
qkv_out_scales, // [(num_heads + 2 * gqa_group_size) * head_size]
const T* qkv_biases, // [num_head + 2 * gqa_group_size, dim_head]
const int max_seq_len,
const int max_blocks_per_seq,
const int num_heads,
const int output_inner_dim,
const int head_size,
const int block_size,
const int elem_cnt,
const int gqa_group_size,
const float* q_norm_weight,
const float* k_norm_weight,
const float rms_norm_eps,
const bool rope_3d) {
using LoadT = AlignedVector<T, VecSize>;
using LoadFloat = AlignedVector<float, VecSize>;
using LoadInT = AlignedVector<InT, VecSize>;
constexpr int HalfVecSize = VecSize / 2;
using LoadEmbT = AlignedVector<float, HalfVecSize>;
LoadInT src_vec;
LoadFloat scale_vec;
LoadT bias_vec;
LoadEmbT cos_emb_vec;
LoadEmbT sin_emb_vec;
LoadFloat tmp_vec;
LoadFloat q_norm_vec;
LoadFloat k_norm_vec;
int64_t global_warp_idx = blockDim.y * blockIdx.x + threadIdx.y;
int64_t all_warp_num = gridDim.x * blockDim.y;
int64_t all_head_dim = elem_cnt / head_size;
const int64_t hidden_size = (num_heads + 2 * gqa_group_size) * head_size;
const int half_head_size = head_size / 2;
for (int global_hi = global_warp_idx; global_hi < all_head_dim; global_hi += all_warp_num) {
int64_t linear_index = global_hi * head_size + threadIdx.x * VecSize;
const int token_id = linear_index / hidden_size;
const int ori_bi = batch_id_per_token[token_id];
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
const int h_bias = bias % head_size;
const int start_token_idx = cu_seqlens_q[ori_bi];
const int write_seq_id =
seq_lens_decoder[ori_bi] + token_id - start_token_idx;
if (write_seq_id == 0) continue;
const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
const int block_idx = block_table_now[write_seq_id / block_size];
if (block_idx < 0) {
printf(
"Fatal Error!!!, block idx %d when write_seq_id is %d\n some key var "
"%d %d %d %d\n",
block_idx,
write_seq_id,
ori_bi,
seq_lens_decoder[ori_bi],
token_id,
cu_seqlens_q[ori_bi]);
}
const int block_offset = write_seq_id % block_size;
const int write_q_idx =
token_id * output_inner_dim * head_size + hi * head_size + h_bias;
const int bias_idx = hi * head_size + h_bias;
Load<InT, VecSize>(&qkv[linear_index], &src_vec);
if (qkv_biases) {
Load<T, VecSize>(&qkv_biases[bias_idx], &bias_vec);
}
if (qkv_out_scales) {
Load<float, VecSize>(&qkv_out_scales[bias_idx], &scale_vec);
}
if (hi < num_heads + gqa_group_size) {
// q k rope
const int64_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
uint32_t new_emb_idx = rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
}
float thread_m2 = 0.0f;
float warp_m2 = 0.0f;
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
// add_bias + rope
float input_left = static_cast<float>(src_vec[2 * i]);
float input_right = static_cast<float>(src_vec[2 * i + 1]);
if (qkv_out_scales) {
input_left *= scale_vec[2 * i];
input_right *= scale_vec[2 * i + 1];
}
if (qkv_biases) {
input_left = input_left + static_cast<float>(bias_vec[2 * i]);
input_right = input_right + static_cast<float>(bias_vec[2 * i + 1]);
}
if (hi < num_heads + gqa_group_size) {
const float cos_tmp = cos_emb_vec[i];
const float sin_tmp = sin_emb_vec[i];
float tmp1 = input_left * cos_tmp - input_right * sin_tmp;
float tmp2 = input_right * cos_tmp + input_left * sin_tmp;
thread_m2 += tmp1 * tmp1 + tmp2 * tmp2;
tmp_vec[2 * i] = tmp1;
tmp_vec[2 * i + 1] = tmp2;
} else {
bias_vec[2 * i] = static_cast<T>(input_left);
bias_vec[2 * i + 1] = static_cast<T>(input_right);
}
}
if (hi < (num_heads + gqa_group_size)) {
WelfordWarpAllReduce<float, 32>(thread_m2, &warp_m2);
float row_variance =
max(warp_m2 / head_size, 0.0f);
float row_inv_var = Rsqrt(row_variance + rms_norm_eps);
if (hi < num_heads) {
Load<float, VecSize>(&q_norm_weight[threadIdx.x * VecSize], &q_norm_vec);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
bias_vec[i] = static_cast<T>(tmp_vec[i] * row_inv_var * q_norm_vec[i]);
}
} else {
Load<float, VecSize>(&k_norm_weight[threadIdx.x * VecSize], &k_norm_vec);
#pragma unroll
for (int i = 0; i < VecSize; i++) {
bias_vec[i] = static_cast<T>(tmp_vec[i] * row_inv_var * k_norm_vec[i]);
}
}
}
if (hi < num_heads) {
// write q
Store<T, VecSize>(bias_vec, &q_out[write_q_idx]);
} else {
// write k/v
const int kv_head_idx = (hi - num_heads) % gqa_group_size;
const int tgt_idx = (block_idx * gqa_group_size * block_size * head_size +
kv_head_idx * block_size * head_size +
block_offset * head_size + h_bias);
// write
if (hi < num_heads + gqa_group_size) {
Store<T, VecSize>(bias_vec, &key_cache[tgt_idx]);
} else {
Store<T, VecSize>(bias_vec, &value_cache[tgt_idx]);
}
}
}
}
template <int VecSize = 4, int HeadDim = 128>
__global__ void append_clear_cache_int8_block(
uint8_t* __restrict__ key_cache, // [num_blocks, gqa_group_size,
@@ -193,7 +355,8 @@ __global__ void append_speculate_cache_rope_kernel(
const int head_size,
const int block_size,
const int elem_cnt,
const int gqa_group_size) {
const int gqa_group_size,
const bool rope_3d) {
using LoadT = AlignedVector<T, VecSize>;
using LoadFloat = AlignedVector<float, VecSize>;
using LoadInT = AlignedVector<InT, VecSize>;
@@ -253,8 +416,9 @@ __global__ void append_speculate_cache_rope_kernel(
if (hi < num_heads + gqa_group_size) {
// q k rope
const int64_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
Load<float, HalfVecSize>(&cos_emb[emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[emb_idx], &sin_emb_vec);
int64_t new_emb_idx = rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
}
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
@@ -326,7 +490,8 @@ __global__ void append_speculate_cache_neox_rope_kernel(
const int head_size,
const int block_size,
const int elem_cnt,
const int gqa_group_size) {
const int gqa_group_size,
const bool rope_3d) {
using LoadT = AlignedVector<T, VecSize>;
using LoadFloat = AlignedVector<float, VecSize>;
using LoadInT = AlignedVector<InT, VecSize>;
@@ -390,8 +555,9 @@ __global__ void append_speculate_cache_neox_rope_kernel(
if (hi < num_heads + gqa_group_size) {
// q k rope
const int64_t emb_idx = write_seq_id * head_size + h_bias;
Load<float, VecSize>(&cos_emb[emb_idx], &cos_emb_vec);
Load<float, VecSize>(&sin_emb[emb_idx], &sin_emb_vec);
int64_t new_emb_idx = rope_3d ? emb_idx + ori_bi * max_seq_len * head_size * 2: emb_idx;
Load<float, VecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, VecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
}
#pragma unroll
for (int i = 0; i < VecSize; i++) {
@@ -476,7 +642,8 @@ __global__ void append_speculate_cache_int8_rope_kernel(
const int block_size,
const float max_bound,
const float min_bound,
const int gqa_group_size) {
const int gqa_group_size,
const bool rope_3d) {
static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
constexpr int NUM_WARPS = 4;
@@ -522,8 +689,9 @@ __global__ void append_speculate_cache_int8_rope_kernel(
// q rope
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, HalfVecSize>(&cos_emb[emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[emb_idx], &sin_emb_vec);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
if (qkv_out_scales) {
Load<float, VecSize>(&qkv_out_scales[bias_idx], &out_scale_vec);
}
@@ -583,10 +751,11 @@ __global__ void append_speculate_cache_int8_rope_kernel(
T scale;
if (head_idx < num_heads + gqa_group_size) {
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, 1>(&cos_emb[emb_idx], &cos_emb_vec1);
Load<float, 1>(&cos_emb[emb_idx + 4], &cos_emb_vec2);
Load<float, 1>(&sin_emb[emb_idx], &sin_emb_vec1);
Load<float, 1>(&sin_emb[emb_idx + 4], &sin_emb_vec2);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
Load<float, 1>(&cos_emb[new_emb_idx], &cos_emb_vec1);
Load<float, 1>(&cos_emb[new_emb_idx + 4], &cos_emb_vec2);
Load<float, 1>(&sin_emb[new_emb_idx], &sin_emb_vec1);
Load<float, 1>(&sin_emb[new_emb_idx + 4], &sin_emb_vec2);
scale = __ldg(&cache_k_scales[kv_head_idx]);
} else {
scale = __ldg(&cache_v_scales[kv_head_idx]);
@@ -708,7 +877,8 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
const int block_size,
const float max_bound,
const float min_bound,
const int gqa_group_size) {
const int gqa_group_size,
const bool rope_3d) {
static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
constexpr int NUM_WARPS = 4;
@@ -757,8 +927,9 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
// q rope
const uint32_t emb_idx = write_seq_id * HeadDim + head_bias;
Load<float, VecSize>(&cos_emb[emb_idx], &cos_emb_vec);
Load<float, VecSize>(&sin_emb[emb_idx], &sin_emb_vec);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim * 2 : emb_idx;
Load<float, VecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, VecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
if (qkv_out_scales) {
Load<float, VecSize>(&qkv_out_scales[bias_idx_left],
&left_out_scale_vec);
@@ -853,10 +1024,11 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
T scale;
const uint32_t emb_idx = write_seq_id * HeadDim + head_bias;
Load<float, HALF_K_VEC_SIZE>(&cos_emb[emb_idx], &cos_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&cos_emb[emb_idx + 8], &cos_emb_vec2);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[emb_idx], &sin_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[emb_idx + 8], &sin_emb_vec2);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim * 2 : emb_idx;
Load<float, HALF_K_VEC_SIZE>(&cos_emb[new_emb_idx], &cos_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&cos_emb[new_emb_idx + 8], &cos_emb_vec2);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[new_emb_idx], &sin_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[new_emb_idx + 8], &sin_emb_vec2);
scale = __ldg(&cache_k_scales[kv_head_idx]);
#pragma unroll
for (int i = 0; i < HALF_K_VEC_SIZE; i++) {
@@ -1088,7 +1260,8 @@ __global__ void append_speculate_cache_int4_rope_kernel(
const int block_size,
const float max_bound,
const float min_bound,
const int gqa_group_size) {
const int gqa_group_size,
const bool rope_3d) {
static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
constexpr int NUM_WARPS = 4;
@@ -1145,8 +1318,9 @@ __global__ void append_speculate_cache_int4_rope_kernel(
// Load<float, VecSize>(&qkv_out_scales[bias_idx], &out_scale_vec);
// q rope
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, HalfVecSize>(&cos_emb[emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[emb_idx], &sin_emb_vec);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
#pragma unroll
for (int i = 0; i < HalfVecSize; i++) {
// dequant + add_bias + rope
@@ -1235,10 +1409,11 @@ __global__ void append_speculate_cache_int4_rope_kernel(
// &out_scale_vec2);
if (head_idx < num_heads + gqa_group_size) {
const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
Load<float, 1>(&cos_emb[emb_idx], &cos_emb_vec1);
Load<float, 1>(&cos_emb[emb_idx + 4], &cos_emb_vec2);
Load<float, 1>(&sin_emb[emb_idx], &sin_emb_vec1);
Load<float, 1>(&sin_emb[emb_idx + 4], &sin_emb_vec2);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
Load<float, 1>(&cos_emb[new_emb_idx], &cos_emb_vec1);
Load<float, 1>(&cos_emb[new_emb_idx + 4], &cos_emb_vec2);
Load<float, 1>(&sin_emb[new_emb_idx], &sin_emb_vec1);
Load<float, 1>(&sin_emb[new_emb_idx + 4], &sin_emb_vec2);
Load<T, HALF_K_VEC_SIZE>(&cache_k_scales[cache_idx], &scale_vec1);
Load<T, HALF_K_VEC_SIZE>(&cache_k_scales[cache_idx + 8], &scale_vec2);
Load<T, HALF_K_VEC_SIZE>(&cache_k_zero_points[cache_idx], &zp_vec1);
@@ -1431,7 +1606,8 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
const int block_size,
const float max_bound,
const float min_bound,
const int gqa_group_size) {
const int gqa_group_size,
const bool rope_3d) {
static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
constexpr int NUM_WARPS = 4;
@@ -1581,10 +1757,11 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
&right_out_scale_vec2);
const uint32_t emb_idx = write_seq_id * HeadDim + head_bias;
Load<float, HALF_K_VEC_SIZE>(&cos_emb[emb_idx], &cos_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&cos_emb[emb_idx + 8], &cos_emb_vec2);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[emb_idx], &sin_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[emb_idx + 8], &sin_emb_vec2);
uint32_t new_emb_idx = rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
Load<float, HALF_K_VEC_SIZE>(&cos_emb[new_emb_idx], &cos_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&cos_emb[new_emb_idx + 8], &cos_emb_vec2);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[new_emb_idx], &sin_emb_vec1);
Load<float, HALF_K_VEC_SIZE>(&sin_emb[new_emb_idx + 8], &sin_emb_vec2);
Load<T, HALF_K_VEC_SIZE>(&cache_k_scales[left_cache_idx],
&left_scale_vec1);
Load<T, HALF_K_VEC_SIZE>(&cache_k_scales[left_cache_idx + 8],

View File

@@ -15,6 +15,78 @@
#include "speculate_write_cache_with_rope_kernel.h"
#include "utils.cuh"
template <typename T, typename QKV_TYPE>
void append_speculate_cache_rope_qk_norm(const QKV_TYPE* qkv,
T* key_cache,
T* value_cache,
T* qkv_out,
const int* block_tables,
const int* batch_id_per_token,
const int* cu_seqlens_q,
const int* seq_lens,
const int* seq_lens_encoder,
const float* cos_emb,
const float* sin_emb,
const float* qkv_out_scales,
const T* qkv_biases,
const int max_seq_len,
const int max_blocks_per_seq,
const int num_heads,
const int kv_num_heads,
const int dim_head,
const int block_size,
const int bsz,
const int token_num,
const cudaStream_t& stream,
const bool use_neox_style,
const float* q_norm_weight,
const float* k_norm_weight,
const float rms_norm_eps,
const bool rope_3d) {
int output_inner_dim = num_heads + 2 * kv_num_heads;
const uint32_t elem_nums =
use_neox_style ? token_num * (num_heads + 2 * kv_num_heads) * dim_head / 2
: token_num * (num_heads + 2 * kv_num_heads) * dim_head;
constexpr int HEAD_DIM = 128;
constexpr int PackSize = HEAD_DIM / kWarpSize;
const int pack_num = elem_nums / PackSize;
const int blocksize = 128;
int grid_size = 1;
GetNumBlocks<128>(pack_num, &grid_size);
if (use_neox_style) {
PD_THROW(
"append_speculate_cache_rope_qk_norm not support neox rope yet");
} else {
dim3 block_dim(kWarpSize, blocksize / kWarpSize, 1);
append_speculate_cache_T_rope_qk_norm_kernel<T, PackSize>
<<<grid_size, block_dim, 0, stream>>>(qkv,
key_cache,
value_cache,
qkv_out,
block_tables,
batch_id_per_token,
cu_seqlens_q,
seq_lens,
cos_emb,
sin_emb,
qkv_out_scales,
qkv_biases,
max_seq_len,
max_blocks_per_seq,
num_heads,
output_inner_dim,
dim_head,
block_size,
elem_nums,
kv_num_heads,
q_norm_weight,
k_norm_weight,
rms_norm_eps,
rope_3d);
}
}
// rope + write
template <typename T, typename QKV_TYPE>
void append_speculate_cache_rope(const QKV_TYPE* qkv,
@@ -39,7 +111,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
const int bsz,
const int token_num,
const cudaStream_t& stream,
const bool use_neox_style) {
const bool use_neox_style,
const bool rope_3d) {
int output_inner_dim = num_heads + 2 * kv_num_heads;
const uint32_t elem_nums =
@@ -73,7 +146,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
dim_head,
block_size,
elem_nums,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_speculate_cache_rope_kernel<T, PackSize>
<<<grid_size, threads_per_block, 0, stream>>>(
@@ -96,7 +170,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
dim_head,
block_size,
elem_nums,
kv_num_heads);
kv_num_heads,
rope_3d);
}
}
@@ -125,7 +200,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
const int bsz,
const int token_num,
const cudaStream_t& stream,
const bool use_neox_style) {
const bool use_neox_style,
const bool rope_3d) {
constexpr int num_warps = 4;
const int all_warps =
((num_heads + 2 * kv_num_heads) + num_warps - 1) / num_warps * num_warps;
@@ -167,7 +243,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
block_size,
127.0f,
-127.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_speculate_cache_int8_rope_kernel<T, 4, 0, 128, QKV_TYPE, IsFP8>
<<<grids, num_warps * 32, 0, stream>>>(qkv,
@@ -191,7 +268,8 @@ void append_speculate_cache_int8_rope(const QKV_TYPE* qkv,
block_size,
127.0f,
-127.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
}
}
@@ -222,7 +300,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
const int bsz,
const int token_num,
const cudaStream_t& stream,
const bool use_neox_style) {
const bool use_neox_style,
const bool rope_3d) {
constexpr int num_warps = 4;
const int all_warps =
((num_heads + 2 * kv_num_heads) + num_warps - 1) / num_warps * num_warps;
@@ -266,7 +345,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
block_size,
7.0f,
-8.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
} else {
append_speculate_cache_int4_rope_kernel<T, 4>
<<<grids, num_warps * 32, 0, stream>>>(qkv,
@@ -292,7 +372,8 @@ void append_speculate_cache_int4_rope(const QKV_TYPE* qkv,
block_size,
7.0f,
-8.0f,
kv_num_heads);
kv_num_heads,
rope_3d);
}
}
template <typename T, typename QKV_TYPE>
@@ -313,11 +394,15 @@ void SpeculateWriteCacheWithRoPEKernel(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_seq_len,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out) {
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps) {
typedef cascade_attn_type_traits<T> traits_;
typedef cascade_attn_type_traits<QKV_TYPE> qkt_nv_type_;
typedef typename traits_::type DataType_;
@@ -342,142 +427,185 @@ void SpeculateWriteCacheWithRoPEKernel(
? rotary_embs.get().data<float>() + max_seq_len * dim_head
: rotary_embs.get().data<float>() + max_seq_len * dim_head / 2;
}
if (cache_quant_type_str == "none") {
append_speculate_cache_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style);
} else if (cache_quant_type_str == "cache_int8") {
append_speculate_cache_int8_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style);
} else if (cache_quant_type_str == "cache_fp8") {
append_speculate_cache_int8_rope<DataType_, QKV_TYPE, true>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style);
} else if (cache_quant_type_str == "cache_int4_zp") {
append_speculate_cache_int4_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
cache_k_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_zp.get().data<T>()))
: nullptr,
cache_v_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_zp.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style);
if (q_norm_weight && k_norm_weight) {
if (cache_quant_type_str == "none") {
append_speculate_cache_rope_qk_norm(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style,
reinterpret_cast<const float*>(q_norm_weight.get().data<float>()),
reinterpret_cast<const float*>(k_norm_weight.get().data<float>()),
rms_norm_eps,
rope_3d);
} else {
PD_THROW(
"append_decode_cache_rope_qk_norm not support cachekv quant yet");
}
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, "
"cache_int4_zp]");
if (cache_quant_type_str == "none") {
append_speculate_cache_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
reinterpret_cast<DataType_*>(key_cache_out->data<T>()),
reinterpret_cast<DataType_*>(value_cache_out->data<T>()),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "cache_int8") {
append_speculate_cache_int8_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "cache_fp8") {
append_speculate_cache_int8_rope<DataType_, QKV_TYPE, true>(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(qkv_out->data<T>()),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style,
rope_3d);
} else if (cache_quant_type_str == "cache_int4_zp") {
append_speculate_cache_int4_rope(
reinterpret_cast<const QKV_TYPE*>(qkv_ptr),
key_cache_out->data<uint8_t>(),
value_cache_out->data<uint8_t>(),
reinterpret_cast<DataType_*>(const_cast<T*>(qkv_out->data<T>())),
block_tables.data<int>(),
batch_id_per_token.data<int>(),
cu_seqlens_q.data<int>(),
seq_lens.data<int>(),
seq_lens_encoder.data<int>(),
cos_emb,
sin_emb,
qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
qkv_biases ? reinterpret_cast<DataType_*>(
const_cast<T*>(qkv_biases.get().data<T>()))
: nullptr,
cache_k_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_scale.get().data<T>()))
: nullptr,
cache_v_scale ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_scale.get().data<T>()))
: nullptr,
cache_k_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_k_zp.get().data<T>()))
: nullptr,
cache_v_zp ? reinterpret_cast<DataType_*>(
const_cast<T*>(cache_v_zp.get().data<T>()))
: nullptr,
max_seq_len,
max_blocks_per_seq,
num_heads,
kv_num_heads,
dim_head,
block_size,
bsz,
token_nums,
stream,
use_neox_rotary_style,
rope_3d);
} else {
PD_THROW(
"cache_quant_type_str should be one of [none, cache_int8, "
"cache_int4_zp]");
}
}
}
@@ -500,11 +628,15 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_seq_len,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void
SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
@@ -526,11 +658,15 @@ SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_seq_len,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void SpeculateWriteCacheWithRoPEKernel<paddle::float16, int>(
const AppendAttnMetaData& meta_data,
@@ -551,11 +687,15 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::float16, int>(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_seq_len,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);
template void
@@ -578,8 +718,12 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_seq_len,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -35,8 +35,12 @@ void SpeculateWriteCacheWithRoPEKernel(
const paddle::optional<paddle::Tensor>& cache_v_zp,
const std::string& cache_quant_type_str,
const bool use_neox_rotary_style,
const bool rope_3d,
const int max_seq_len,
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -56,6 +56,7 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -103,5 +104,6 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -98,5 +99,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -100,5 +101,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, f
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -100,5 +101,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, t
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -99,5 +100,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -99,5 +100,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
const bool causal,
const bool is_decoder,
const bool enable_prefill,
const std::string& cache_quant_type_str,
cudaStream_t& stream,
paddle::Tensor* out);

View File

@@ -43,4 +43,7 @@ EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -42,4 +42,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -42,4 +42,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -42,4 +42,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
paddle::Tensor* value_cache_out);
paddle::Tensor* value_cache_out,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps);

View File

@@ -27,6 +27,7 @@ struct AppendAttnMetaData {
int head_dims;
int head_dims_v;
int max_blocks_per_seq;
const int *mask_offset = nullptr;
};
__forceinline__ __host__ __device__ int div_up(int a, int b) {
@@ -430,6 +431,9 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (group_size == 12) { \
constexpr size_t GROUP_SIZE = 12; \
__VA_ARGS__ \
} else if (group_size == 14) { \
constexpr size_t GROUP_SIZE = 14; \
__VA_ARGS__ \
} else if (group_size == 16) { \
constexpr size_t GROUP_SIZE = 16; \
__VA_ARGS__ \
@@ -437,6 +441,15 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
PD_THROW("not support the group_size", group_size); \
}
#define DISPATCH_DyCfp8(is_dynamic_cfp8, IsDynamicC8, ...) \
if (is_dynamic_cfp8) { \
constexpr bool IsDynamicC8 = true; \
__VA_ARGS__ \
} else { \
constexpr bool IsDynamicC8 = false; \
__VA_ARGS__ \
}
#define DISPATCH_MLA_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
if (group_size == 8) { \
constexpr size_t GROUP_SIZE = 8; \
@@ -474,6 +487,9 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
if (causal) { \
constexpr bool CAUSAL = true; \
__VA_ARGS__ \
} else { \
constexpr bool CAUSAL = false; \
__VA_ARGS__ \
}
#define DISPATCH_ENABLE_PREFILL(enable_prefill, ENABLE_PREFILL, ...) \
@@ -559,3 +575,37 @@ template <typename T, bool IsFP8>inline __device__ static void convert_c8(T * re
convert_int8(result, source);
}
}
constexpr int kWarpSize = 32;
template<typename T>
inline __device__ void WelfordCombine1(T b_m2, T* m2) {
*m2 += b_m2;
}
template<typename T, int thread_group_width = kWarpSize>
__inline__ __device__ void WelfordWarpReduce(T thread_m2, T* m2) {
*m2 = thread_m2;
for (int mask = thread_group_width / 2; mask > 0; mask >>= 1) {
T b_m2 = __shfl_xor_sync(0xffffffff, *m2, mask);
WelfordCombine1(b_m2, m2);
}
}
template<typename T, int thread_group_width = kWarpSize>
__inline__ __device__ void WelfordWarpAllReduce(T thread_m2, T* m2) {
WelfordWarpReduce<T, thread_group_width>(thread_m2, m2);
}
template <typename T>
__inline__ __device__ T Rsqrt(T x);
template <>
__inline__ __device__ float Rsqrt<float>(float x) {
return rsqrt(x);
}
template <>
__inline__ __device__ double Rsqrt<double>(double x) {
return rsqrt(x);
}

View File

@@ -63,7 +63,7 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor &kv_num_blocks,
const paddle::Tensor &decoder_batch_ids,
const paddle::Tensor &decoder_tile_ids_per_batch,
const paddle::Tensor &decoder_num_blocks,
const paddle::Tensor &decoder_num_blocks_cpu,
const paddle::Tensor &set_max_lengths, const paddle::Tensor &max_len_kv,
const paddle::optional<paddle::Tensor> &rotary_embs,
const paddle::optional<paddle::Tensor> &attn_mask,
@@ -77,7 +77,54 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::optional<paddle::Tensor> &cache_v_zp,
const paddle::optional<paddle::Tensor> &out_linear_shifts,
const paddle::optional<paddle::Tensor> &out_linear_smooths,
const paddle::optional<paddle::Tensor> &mask_offset,
const paddle::optional<paddle::Tensor> &kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string &compute_dtype, const std::string &cache_quant_type_str,
const bool use_neox_rotary_style, const bool rope_3d,
const int max_input_length, const float quant_max_bound,
const float quant_min_bound, const float out_linear_in_scale,
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int max_partition_size, const int encoder_max_partition_size,
const int speculate_max_draft_token_num, const bool causal,
const bool speculate_decoder);
void AppendAttentionWithOutput(
const paddle::Tensor &qkv, const paddle::Tensor &key_cache,
const paddle::Tensor &value_cache, const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &batch_id_per_token, const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_tables, const paddle::Tensor &encoder_batch_ids,
const paddle::Tensor &encoder_tile_ids_per_batch,
const paddle::Tensor &encoder_num_blocks,
const paddle::Tensor &kv_batch_ids,
const paddle::Tensor &kv_tile_ids_per_batch,
const paddle::Tensor &kv_num_blocks,
const paddle::Tensor &decoder_batch_ids,
const paddle::Tensor &decoder_tile_ids_per_batch,
const paddle::Tensor &decoder_num_blocks_cpu,
const paddle::Tensor &set_max_lengths, const paddle::Tensor &max_len_kv,
paddle::Tensor &fmha_out,
const paddle::optional<paddle::Tensor> &rotary_embs,
const paddle::optional<paddle::Tensor> &attn_mask,
const paddle::optional<paddle::Tensor> &qkv_bias,
const paddle::optional<paddle::Tensor> &qkv_out_scales,
const paddle::optional<paddle::Tensor> &cache_k_quant_scales,
const paddle::optional<paddle::Tensor> &cache_v_quant_scales,
const paddle::optional<paddle::Tensor> &cache_k_dequant_scales,
const paddle::optional<paddle::Tensor> &cache_v_dequant_scales,
const paddle::optional<paddle::Tensor> &cache_k_zp,
const paddle::optional<paddle::Tensor> &cache_v_zp,
const paddle::optional<paddle::Tensor> &out_linear_shifts,
const paddle::optional<paddle::Tensor> &out_linear_smooths,
const paddle::optional<paddle::Tensor> &mask_offset,
const paddle::optional<paddle::Tensor> &kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const float rms_norm_eps,
const std::string &compute_dtype, const std::string &cache_quant_type_str,
const bool use_neox_rotary_style, const bool rope_3d,
const int max_input_length, const float quant_max_bound,
@@ -107,7 +154,8 @@ std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::optional<paddle::Tensor> &cache_v_zp,
const paddle::optional<paddle::Tensor> &kv_signal_data,
const int kv_token_num, const int max_seq_len,
const std::string &cache_quant_type);
const std::string &cache_quant_type,
const bool rope_3d);
std::vector<paddle::Tensor>
PreCacheLenConcat(const paddle::Tensor &seq_lens_decoder,
@@ -124,11 +172,29 @@ paddle::Tensor FusedExpertMoeFunc(
const std::string &quant_method, const int moe_topk,
const bool norm_topk_prob, const bool group_moe);
std::vector<paddle::Tensor> MacheteMMKernel(
paddle::Tensor const& A, paddle::Tensor const& B,
paddle::optional<paddle::Tensor> const& maybe_group_scales,
paddle::optional<paddle::Tensor> const& maybe_group_zeros,
paddle::optional<paddle::Tensor> const& maybe_channel_scales,
paddle::optional<paddle::Tensor> const& maybe_token_scales,
std::string const& b_type_str,
std::string const& maybe_out_type_str,
int64_t const& maybe_group_size,
std::string const& maybe_schedule);
std::vector<paddle::Tensor> MachetePrepackBKernel(
paddle::Tensor const& B, std::string const& a_type_str, std::string const& b_type_str,
std::string const& maybe_group_scales_type_str);
std::vector<std::string> MacheteSupportedSchedules(
std::string const& a_type_str, std::string const& b_type_str);
std::vector<paddle::Tensor> MoeExpertDispatch(
const paddle::Tensor &input, const paddle::Tensor &gating_output,
const paddle::optional<paddle::Tensor> &gating_correction_bias,
const paddle::optional<paddle::Tensor> &w4a8_in_scale, const int moe_topk,
const bool group_moe, const bool topk_only_mode);
const bool group_moe, const std::string &moe_quant_type, const bool topk_only_mode);
std::vector<paddle::Tensor>
MoETopKSelectKernel(const paddle::Tensor &gating_logits,
@@ -188,7 +254,9 @@ paddle::Tensor MoeExpertFFNFunc(
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
const std::string& quant_method, const bool used_in_ep_low_latency);
const std::string& quant_method, const bool used_in_ep_low_latency,
const int estimate_total_token_nums,
const int hadamard_block_size);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,
@@ -231,14 +299,23 @@ paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id,
paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata,
const int layer_id);
std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
void GetBlockShapeAndSplitKVBlock(
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
paddle::Tensor &decoder_batch_ids, // Inplace
paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &decoder_num_blocks_cpu, // Inplace, Pinned Memory
paddle::Tensor &decoder_num_blocks_device, // Inplace
paddle::Tensor &decoder_chunk_size_device, // Inplace
paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
paddle::Tensor &encoder_batch_ids, // Inplace
paddle::Tensor &encoder_tile_ids_per_batch, // Inplace
paddle::Tensor &encoder_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &kv_batch_ids, // Inplace
paddle::Tensor &kv_tile_ids_per_batch, // Inplace
paddle::Tensor &kv_num_blocks_x_cpu, // Inplace, Pinned Memory
paddle::Tensor &max_len_kv_cpu, // Inplace, Pinned Memory
const int encoder_block_shape_q,
const int decoder_block_shape_q,
const int group_size,
@@ -311,9 +388,11 @@ void RecoverDecodeTask(const paddle::Tensor &stop_flags,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &block_tables,
const paddle::Tensor &is_block_step,
const int block_size);
const paddle::optional<paddle::Tensor> &draft_tokens,
const paddle::optional<paddle::Tensor> &step_draft_tokens,
const paddle::optional<paddle::Tensor> &step_seq_lens_this_time,
const int block_size,
const int max_draft_tokens);
paddle::Tensor
GroupSwigluWithMasked(const paddle::Tensor &fc1_out_tensor,
@@ -323,7 +402,7 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
const paddle::Tensor &max_seq_len, const paddle::Tensor &max_seq_len_index,
const paddle::Tensor &mm_token_num_len,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &score_text);
const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &hidden_states);
std::vector<paddle::Tensor> MoEDeepGEMMPermute(const paddle::Tensor &x,
const paddle::Tensor &topk_idx,
@@ -337,8 +416,8 @@ std::vector<paddle::Tensor> MoEDeepGEMMDePermute(
const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights);
void TextImageIndexOut(const paddle::Tensor &token_type_ids,
const paddle::Tensor &text_input,
const paddle::Tensor &image_input);
paddle::Tensor &text_input,
paddle::Tensor &image_input);
void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input,
paddle::Tensor &image_input,
@@ -396,23 +475,18 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
const paddle::Tensor& query,
const paddle::Tensor& key_cache,
const paddle::Tensor& value_cache,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& block_tables,
const paddle::Tensor& encoder_batch_ids,
const paddle::Tensor& encoder_tile_ids_per_batch,
const paddle::Tensor& encoder_num_blocks,
const paddle::Tensor& kv_batch_ids,
const paddle::Tensor& kv_tile_ids_per_batch,
const paddle::Tensor& kv_num_blocks,
const paddle::Tensor& decoder_batch_ids,
const paddle::Tensor& decoder_tile_ids_per_batch,
const paddle::Tensor& decoder_num_blocks,
const paddle::Tensor& decoder_num_blocks_cpu,
const paddle::Tensor& max_enc_len_this_time,
const paddle::Tensor& decoder_num_blocks_device,
const paddle::Tensor& decoder_chunk_size_device,
const paddle::Tensor& max_dec_len_this_time,
const paddle::Tensor& max_len_kv,
const paddle::optional<paddle::Tensor>& attn_mask,
@@ -497,6 +571,7 @@ std::vector<paddle::Tensor> NoauxTc(
int n_group,
int topk_group,
int topk,
bool renormalize,
float routed_scaling_factor);
#ifdef ENABLE_FP8
@@ -526,7 +601,7 @@ paddle::Tensor FusedHadamardQuantFp8Func(
int64_t init_custom_all_reduce(const std::vector<int64_t>& fake_ipc_ptrs,
paddle::Tensor& rank_data, int64_t rank, bool full_nvlink);
void all_reduce(int64_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
void all_reduce(paddle::Tensor& inp, paddle::Tensor& out, int64_t _fa,
int64_t reg_buffer, int64_t reg_buffer_sz_bytes);
void dispose(int64_t _fa);
@@ -548,6 +623,8 @@ int64_t open_mem_handle(paddle::Tensor& mem_handle);
void free_shared_buffer(int64_t buffer);
void clear_ipc_handles(int64_t _fa);
// speculative decoding Kernel
std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
const paddle::Tensor& input_ids,
@@ -609,7 +686,7 @@ void SpeculateVerify(
const paddle::Tensor &actual_draft_token_nums, const paddle::Tensor &topp,
int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode);
void SpeculateUpdateV3(const paddle::Tensor &seq_lens_encoder,
void SpeculateUpdate(const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &not_need_stop,
const paddle::Tensor &draft_tokens,
@@ -640,6 +717,22 @@ void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
const paddle::Tensor& seq_lens_decoder);
void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
const paddle::Tensor &block_tables,
const paddle::Tensor &stop_flags,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_seq_lens_decoder,
const paddle::Tensor &step_draft_tokens,
const paddle::Tensor &step_seq_lens_this_time,
const paddle::Tensor &accept_num,
const paddle::Tensor &accept_tokens,
const paddle::Tensor &is_block_step,
const paddle::Tensor &not_need_stop,
const paddle::Tensor &stop_nums,
const int block_size,
const int max_draft_tokens);
void NgramMatch(const paddle::Tensor &input_ids,
const paddle::Tensor &input_ids_len,
const paddle::Tensor &pre_ids,
@@ -654,6 +747,20 @@ void NgramMatch(const paddle::Tensor &input_ids,
const int max_draft_tokens);
void HybridMtpNgram(const paddle::Tensor &input_ids,
const paddle::Tensor &input_ids_len,
const paddle::Tensor &pre_ids,
const paddle::Tensor &step_idx,
const paddle::Tensor &draft_token_num,
const paddle::Tensor &draft_tokens,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &max_dec_len,
const int max_ngram_size,
const int min_ngram_size,
const int max_draft_tokens);
// MTP
void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
const paddle::Tensor& base_model_seq_lens_this_time,
@@ -669,9 +776,12 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& not_need_stop,
const paddle::Tensor& is_block_step,
const paddle::Tensor& batch_drop,
const paddle::Tensor& pre_ids,
const paddle::Tensor& accept_tokens,
const paddle::Tensor& accept_num,
const paddle::Tensor& base_model_seq_lens_this_time,
const paddle::Tensor& base_model_seq_lens_encoder,
const paddle::Tensor& base_model_seq_lens_decoder,
const paddle::Tensor& base_model_step_idx,
@@ -680,7 +790,8 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
const paddle::Tensor& base_model_draft_tokens,
const int max_draft_token,
const bool truncate_first_token,
const bool splitwise_prefill);
const bool splitwise_prefill,
const bool kvcache_scheduler_v1);
void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
@@ -761,6 +872,33 @@ void SpeculateStepPaddle(
const int encoder_decoder_block_num,
const int max_draft_tokens);
void MergePrefillDecodeOutput(
const paddle::Tensor &encoder_res,
const paddle::Tensor &decoder_res,
const paddle::Tensor &seq_lens_encoder,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &seq_lens_this_time,
const paddle::Tensor &cu_seq_q,
const int head_num,
const int head_dim,
const int max_token);
std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
const paddle::Tensor &top_p,
const paddle::optional<paddle::Tensor> &top_k,
int64_t seed);
std::vector<paddle::Tensor> TopKRenorm(const paddle::Tensor &probs,
const paddle::Tensor &top_k);
std::vector<paddle::Tensor> MinPSamplingFromProbs(const paddle::Tensor &probs,
const paddle::Tensor &min_p);
void SaveOutMmsgStatic(const paddle::Tensor& x,
const paddle::Tensor& not_need_stop,
int64_t rank_id,
bool save_each_rank);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -814,6 +952,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
* append_attention
*/
m.def("append_attention", &AppendAttention, "append attention function");
m.def("append_attention_with_output", &AppendAttentionWithOutput, "append attention with output function");
/**
* gqa_rope_write_cache.cu
* gqa_rope_write_cache
@@ -845,7 +984,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("moe_expert_dispatch", &MoeExpertDispatch, py::arg("input"),
py::arg("gating_output"), py::arg("gating_correction_bias"),
py::arg("w4a8_in_scale"), py::arg("moe_topk"), py::arg("group_moe"),
py::arg("topk_only_mode"), "moe export dispatch function");
py::arg("moe_quant_type"), py::arg("topk_only_mode"), "moe export dispatch function");
/**
* moe/fused_moe/ep_moe_prefill_func.cu
@@ -869,12 +1008,33 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"),
py::arg("block_size"),
"per token per block quant and padding tranpose scale");
"per token per block quant and padding transpose scale");
m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"),
py::arg("recv_expert_count"), py::arg("block_size"),
"per token per block quant");
#ifdef ENABLE_MACHETE
/*machete/machete_mm.cu
* machete_mm
*/
m.def("machete_mm", &MacheteMMKernel, py::arg("A"), py::arg("B"), py::arg("maybe_group_scale"),
py::arg("maybe_group_zeros"), py::arg("maybe_channel_scales"), py::arg("maybe_token_scales"),
py::arg("b_type_str"), py::arg("maybe_out_type_str"), py::arg("maybe_group_size"),
py::arg("maybe_schedule"),
"machete mm function");
/*machete/machete_prepack_B.cu
* machete_prepack_B
*/
m.def("machete_prepack_B", &MachetePrepackBKernel, "machete prepacked B function");
/*machete/machete_supported_schedules.cu
* machete_supported_schedules
*/
m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function");
#endif
/**
* moe/fused_moe/moe_topk_select.cu
* moe_topk_select
@@ -891,7 +1051,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("moe_expert_ffn", &MoeExpertFFNFunc, "moe export ffn function");
/**
* moe/fused_moe/moe_ffn_wint2.cu
* moe/fused_moe/moe_expert_ffn_wint2.cu
* moe_expert_ffn_wint2
*/
m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function");
@@ -1071,6 +1231,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
m.def("clear_ipc_handles", &clear_ipc_handles, "clear_ipc_handles");
m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");
@@ -1088,7 +1250,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("speculate_verify",&SpeculateVerify, "speculate_verify function");
m.def("speculate_update_v3",&SpeculateUpdateV3, "noaux_tc for Deepseekv3 MoE compute function");
m.def("speculate_update",&SpeculateUpdate, "Speculate Update Kernel");
m.def("speculate_set_value_by_flags_and_idx",&SpeculateSetValueByFlagsAndIdx, "speculate_set_value_by_flags_and_idx function");
@@ -1096,8 +1258,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("speculate_clear_accept_nums",&SpeculateClearAcceptNums, "speculate_clear_accept_nums function");
m.def("speculate_schedule_cache",&SpeculateScheduleCache, "SpeculateScheduleCache function");
m.def("ngram_match", &NgramMatch, "ngram_match function");
m.def("hybird_mtp_ngram", &HybridMtpNgram, "ngram_match_mixed function");
m.def("draft_model_postprocess",&DraftModelPostprocess, "draft_model_postprocess function");
m.def("draft_model_preprocess",&DraftModelPreprocess, "draft_model_preprocess function");
@@ -1111,4 +1277,14 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("mtp_step_paddle",&MTPStepPaddle, "mtp_step_paddle function");
m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");
m.def("merge_prefill_decode_output", &MergePrefillDecodeOutput, "merge_prefill_decode_output function");
m.def("rejection_top_p_sampling", &TopPSamplingReject, "rejection_top_p_sampling function");
m.def("top_k_renorm_probs", &TopKRenorm, "top_k_renorm_probs function");
m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function");
m.def("save_output", &SaveOutMmsgStatic, "save_output function");
}

View File

@@ -49,7 +49,7 @@ fptr_t init_custom_all_reduce(const std::vector<fptr_t>& fake_ipc_ptrs,
* Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
* copied into _reg_buffer.
*/
void all_reduce(fptr_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
void all_reduce(paddle::Tensor& inp, paddle::Tensor& out, fptr_t _fa,
fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
auto stream = inp.stream();
@@ -122,10 +122,14 @@ void register_graph_buffers(fptr_t _fa,
for (int i = 0; i < handles.size(); i++) {
bytes.emplace_back(handles[i].begin(), handles[i].end());
}
bytes.reserve(handles.size());
fa->register_graph_buffers(bytes, offsets);
}
void clear_ipc_handles(fptr_t _fa) {
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
fa->clear_ipc_handles();
}
std::tuple<fptr_t, paddle::Tensor> allocate_shared_buffer_and_handle(
int64_t size) {
@@ -163,3 +167,12 @@ fptr_t open_mem_handle(paddle::Tensor& mem_handle) {
void free_shared_buffer(fptr_t buffer) {
CUDACHECK(cudaFree(reinterpret_cast<void*>(buffer)));
}
PD_BUILD_STATIC_OP(all_reduce)
.Inputs({"inp",
"out"})
.Outputs({"new_out"})
.Attrs({"_fa: int64_t", "_reg_buffer: int64_t", "reg_buffer_sz_bytes: int64_t"})
.SetInplaceMap({{"out", "new_out"}})
.SetKernelFn(PD_KERNEL(all_reduce));

View File

@@ -303,7 +303,7 @@ class CustomAllreduce {
bool full_nvlink_;
RankSignals sg_;
// Stores an map from a pointer to its peer pointters from all ranks.
// Stores an map from a pointer to its peer pointers from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
@@ -517,10 +517,15 @@ class CustomAllreduce {
#undef KL
}
~CustomAllreduce() {
void clear_ipc_handles(){
for (auto [_, ptr] : ipc_handles_) {
CUDACHECK(cudaIpcCloseMemHandle(ptr));
}
ipc_handles_.clear();
}
~CustomAllreduce() {
clear_ipc_handles();
}
};
} // namespace paddle

View File

@@ -6,6 +6,8 @@
// clang-format off
#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp"
#include "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp"
#include "helper.h"
// clang-format on
/*

View File

@@ -89,11 +89,11 @@ public:
GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN,
Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator::Policy::MmaShape::kK);
/// Number of warp-level GEMM oeprations per load for B
/// Number of warp-level GEMM operations per load for B
static constexpr int kWarpGemmIterationsPerLoadForB =
Operator::IteratorB::InstructionShape::kRow / Operator::InstructionShape::kK;
static_assert(!(kWarpGemmIterations % kWarpGemmIterationsPerLoadForB), "");

View File

@@ -117,7 +117,7 @@ class LeftGELUAndMul {
CUTLASS_HOST_DEVICE
FragmentOutput operator()(FragmentAccumulator const &lhs,
FragmentAccumulator const &rhs) const {
// Convert source to interal compute numeric type
// Convert source to internal compute numeric type
NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
accumulator_to_compute;

View File

@@ -117,7 +117,7 @@ class LeftSiLUAndMul {
CUTLASS_HOST_DEVICE
FragmentOutput operator()(FragmentAccumulator const &lhs,
FragmentAccumulator const &rhs) const {
// Convert source to interal compute numeric type
// Convert source to internal compute numeric type
NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round>
accumulator_to_compute;

View File

@@ -92,7 +92,7 @@ class DualMmaBase {
Shape::kN / WarpGemm::kN,
Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator0::Policy::MmaShape::kK);

View File

@@ -219,7 +219,7 @@ class EpilogueVisitorPerRowPerColNf4 {
iterator_C_.clear_mask();
}
// NOTE(wangbojun) Currently, this kernel don't hanve implantention for
// adding elementwise beta, we keep this here for future useage beta_ =
// adding elementwise beta, we keep this here for future usage beta_ =
// (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr :
// params.elementwise.beta); if (beta_ == ElementAccumulator()) {
// iterator_C_.clear_mask();

View File

@@ -176,7 +176,7 @@ struct Nf4DefaultIteratorsTensorOp<cutlass::bfloat16_t,
///
/// Satisfies: ReadableTileIterator
///
template <typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap)
template <typename ThreadMap_ ///< Thread map (concept: OutputTileThreadMap)
>
class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 16, 8, 8> {
public:

View File

@@ -64,7 +64,7 @@ template <
typename InstructionShape_,
/// Number of stages used in the pipelined mainloop
int Stages,
/// Operation perfomed by GEMM
/// Operation performed by GEMM
typename Operator,
/// Store the accumulators in row major or column major. Row major is used
/// when output layout is interleaved.

View File

@@ -133,7 +133,7 @@ public:
/// Shape describing the number of warps filling the CTA
using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
static constexpr int kNumKIterationsPerWarpBLoad =

View File

@@ -509,7 +509,7 @@ public:
this->warp_tile_iterator_B_.load(warp_frag_B[(warp_tileB_k_load_offset + 1) % 2]);
++this->warp_tile_iterator_B_;
}
// TOOD(wangbojun) lds_converter can be remove for int8 B input
// TODO(wangbojun) lds_converter can be remove for int8 B input
typename TransformBAfterLDS::result_type converted_frag_B =
lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);

View File

@@ -96,7 +96,7 @@ public:
/// Shape describing the number of warps filling the CTA
using WarpCount = GemmShape<Shape::kM / WarpGemm::kM, Shape::kN / WarpGemm::kN, Shape::kK / WarpGemm::kK>;
/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK);
static_assert(Operator::IteratorB::InstructionShape::kRow>=Operator::InstructionShape::kK,"");
static constexpr int kNumKIterationsPerWarpBLoad =

View File

@@ -646,7 +646,7 @@ public:
// );
// }
}
// TOOD(wangbojun) lds_converter can be remove for int8 B input
// TODO(wangbojun) lds_converter can be remove for int8 B input
// int4
// typename TransformBAfterLDS::result_type converted_frag_B =
// lds_converter(warp_frag_B[warp_tileB_k_load_offset % 2]);

View File

@@ -59,6 +59,15 @@ inline uint32_t get_cascade_attention_num_threads() {
inline bool get_mla_use_tensorcore() {
static const char* mla_use_tensorcore_env = std::getenv("FLAGS_mla_use_tensorcore");
static const uint32_t mla_use_tensorcore =
mla_use_tensorcore_env == nullptr ? 1 : std::stoul(std::string(mla_use_tensorcore_env));
mla_use_tensorcore_env == nullptr ? 0 : std::stoul(std::string(mla_use_tensorcore_env));
return mla_use_tensorcore != 0 ? true : false;
}
inline int get_mla_dec_chunk_size(int bsz) {
static const char* mla_dec_chunk_size_env =
std::getenv("FLAGS_mla_dec_chunk_size");
static const int mla_dec_chunk_size =
mla_dec_chunk_size_env == nullptr
? -1
: std::stoi(std::string(mla_dec_chunk_size_env));
return bsz > 1 ? mla_dec_chunk_size : 64;
}

View File

@@ -20,7 +20,7 @@ __global__ void extract_text_token_output_kernel(int *max_seq_len,
int *mm_token_num_len,
int *seq_lens_this_time,
int *cu_seqlens_q,
float *score_text,
float *hidden_states,
float *output,
const int bsz,
const int hidden_size) {
@@ -32,14 +32,11 @@ __global__ void extract_text_token_output_kernel(int *max_seq_len,
int max_seq_len_index_data = max_seq_len_index[0];
int mm_token_num_len_data = mm_token_num_len[0];
int true_bsz = cu_seqlens_q[bsz_index + 1] - 1;
if (bsz_index >= max_seq_len_index_data) {
true_bsz = true_bsz - mm_token_num_len_data;
}
if (max_seq_len_data == mm_token_num_len_data && bsz_index == max_seq_len_index_data) {
output[bsz_index * hidden_size + block_idx] = 0.0;
} else {
if (seq_lens_this_time[bsz_index] != 0) {
output[bsz_index * hidden_size + block_idx] = score_text[true_bsz * hidden_size + block_idx];
output[bsz_index * hidden_size + block_idx] = hidden_states[true_bsz * hidden_size + block_idx];
}
}
__syncthreads();
@@ -51,19 +48,19 @@ std::vector<paddle::Tensor> ExtractTextTokenOutput(
const paddle::Tensor& mm_token_num_len,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& score_text) {
const paddle::Tensor& hidden_states) {
const int bsz = seq_lens_this_time.shape()[0];
const int hidden_size = score_text.shape()[1];
paddle::Tensor output = paddle::full({bsz, hidden_size}, 1, paddle::DataType::FLOAT32, score_text.place());
const int hidden_size = hidden_states.shape()[1];
paddle::Tensor output = paddle::full({bsz, hidden_size}, 1, paddle::DataType::FLOAT32, hidden_states.place());
extract_text_token_output_kernel<1024><<<hidden_size, 1024, 0, score_text.stream()>>>(
extract_text_token_output_kernel<1024><<<hidden_size, 1024, 0, hidden_states.stream()>>>(
const_cast<int*>(max_seq_len.data<int>()),
const_cast<int*>(max_seq_len_index.data<int>()),
const_cast<int*>(mm_token_num_len.data<int>()),
const_cast<int*>(seq_lens_this_time.data<int>()),
const_cast<int*>(cu_seqlens_q.data<int>()),
const_cast<float*>(score_text.data<float>()),
const_cast<float*>(hidden_states.data<float>()),
output.data<float>(),
bsz,
hidden_size
@@ -76,9 +73,9 @@ std::vector<std::vector<int64_t>> ExtractTextTokenOutputInferShape(const std::ve
const std::vector<int64_t>& mm_token_num_len_shape,
const std::vector<int64_t>& seq_lens_this_time_shape,
const std::vector<int64_t>& cu_seqlens_q_shape,
const std::vector<int64_t>& score_text_shape) {
const std::vector<int64_t>& hidden_states_shape) {
const int bsz = seq_lens_this_time_shape[0];
const int hidden_size = score_text_shape[1];
const int hidden_size = hidden_states_shape[1];
return {{bsz, hidden_size}};
}
@@ -87,8 +84,8 @@ std::vector<paddle::DataType> ExtractTextTokenOutputInferDtype(const paddle::Dat
const paddle::DataType& mm_token_num_len_dtype,
const paddle::DataType& seq_lens_this_time_dtype,
const paddle::DataType& cu_seqlens_q_dtype,
const paddle::DataType& score_text_dtype) {
return {score_text_dtype};
const paddle::DataType& hidden_states_dtype) {
return {hidden_states_dtype};
}
PD_BUILD_STATIC_OP(extract_text_token_output)
@@ -97,7 +94,7 @@ PD_BUILD_STATIC_OP(extract_text_token_output)
"mm_token_num_len",
"seq_lens_this_time",
"cu_seqlens_q",
"score_text"})
"hidden_states"})
.Outputs({"output"})
.SetKernelFn(PD_KERNEL(ExtractTextTokenOutput))
.SetInferShapeFn(PD_INFER_SHAPE(ExtractTextTokenOutputInferShape))

View File

@@ -0,0 +1,163 @@
/******************************************************************************
* Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
******************************************************************************/
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
#include "kernel_traits.h"
#include "flash_mask_attn_kernel.hpp"
template <typename paddle_type>
struct cuteType;
template <>
struct cuteType<phi::dtype::float16> {
using type = cutlass::half_t;
};
template <>
struct cuteType<phi::dtype::bfloat16> {
using type = cutlass::bfloat16_t;
};
template <typename T>
std::vector<paddle::Tensor> DispatchFlashAttentionMask(
const paddle::Tensor& q_input,
const paddle::Tensor& k_input,
const paddle::Tensor& v_input,
const paddle::Tensor& cu_seq_q,
const paddle::Tensor& cu_seq_k,
const paddle::Tensor& seq_len_encoder,
const paddle::optional<paddle::Tensor>& mask,
const int head_num,
const int kv_head_num,
const int head_dim,
const int max_seq_len,
const int max_enc_len_this_time,
const int max_dec_len_this_time) {
constexpr int kBlockM = 128;
constexpr int kBlockN = 128;
const int batch_size = cu_seq_q.dims()[0];
paddle::Tensor out = paddle::empty(
{q_input.dims()[0], head_num * head_dim}, q_input.dtype(), q_input.place());
Flash_mask_params params;
memset(&params, 0, sizeof(Flash_mask_params));
params.q_ptr = const_cast<T*>(q_input.data<T>());
params.k_ptr = const_cast<T*>(k_input.data<T>());
params.v_ptr = const_cast<T*>(v_input.data<T>());
params.o_ptr = const_cast<T*>(out.data<T>());
params.cu_seq_q = const_cast<int*>(cu_seq_q.data<int>());
params.cu_seq_k = const_cast<int*>(cu_seq_k.data<int>());
params.seq_len_encoder = const_cast<int*>(seq_len_encoder.data<int>());
params.head_num = head_num;
params.kv_head_num = kv_head_num;
params.max_seq_len_q = max_enc_len_this_time;
params.max_seq_len_k = max_enc_len_this_time + max_dec_len_this_time;
params.batch_size = batch_size;
params.gqa_group_size = head_num / kv_head_num;
constexpr float kLog2e = 1.4426950408889634074;
params.scale_softmax_log2 = 1.0f / std::sqrt(head_dim) * kLog2e;
using cute_type = typename cuteType<T>::type;
if (mask) {
params.mask = const_cast<int*>(mask.get().data<int>());
flash_attn_headdim128<kBlockM, kBlockN, true, cute_type>(params, 0);
} else {
flash_attn_headdim128<kBlockM, kBlockN, false, cute_type>(params, 0);
}
return {out};
}
std::vector<paddle::Tensor> FlashAttentionMask(
const paddle::Tensor& q_input,
const paddle::Tensor& k_input,
const paddle::Tensor& v_input,
const paddle::Tensor& cu_seq_q,
const paddle::Tensor& cu_seq_k,
const paddle::Tensor& seq_len_encoder,
const paddle::optional<paddle::Tensor> &mask,
const int head_num,
const int kv_head_num,
const int head_dim,
const int max_seq_len,
const int max_enc_len_this_time,
const int max_dec_len_this_time) {
if (q_input.dtype() == paddle::DataType::FLOAT16) {
using T = phi::dtype::float16;
return std::move(
DispatchFlashAttentionMask<T>(
q_input,
k_input,
v_input,
cu_seq_q,
cu_seq_k,
seq_len_encoder,
mask,
head_num,
kv_head_num,
head_dim,
max_seq_len,
max_enc_len_this_time,
max_dec_len_this_time));
} else if (q_input.dtype() == paddle::DataType::BFLOAT16) {
using T = phi::dtype::bfloat16;
return std::move(
DispatchFlashAttentionMask<T>(
q_input,
k_input,
v_input,
cu_seq_q,
cu_seq_k,
seq_len_encoder,
mask,
head_num,
kv_head_num,
head_dim,
max_seq_len,
max_enc_len_this_time,
max_dec_len_this_time));
}
}
PD_BUILD_OP(flash_attention_mask)
.Inputs({
"q_input",
"k_input",
"v_input",
"cu_seq_q",
"cu_seq_k",
"seq_len_encoder",
paddle::Optional("mask")})
.Attrs({
"head_num: int",
"kv_head_num: int",
"head_dim: int",
"max_seq_len: int",
"max_enc_len_this_time: int",
"max_dec_len_this_time: int"})
.Outputs({
"out"})
.SetKernelFn(PD_KERNEL(FlashAttentionMask));

View File

@@ -0,0 +1,231 @@
/******************************************************************************
* Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
******************************************************************************/
#pragma once
#include "cute/algorithm/copy.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/layout/layout.h"
#include "cutlass/numeric_types.h"
#include "cutlass/pipeline/pipeline.hpp"
#include "cutlass/cluster_launch.hpp"
#include "cutlass/arch/reg_reconfig.h"
#include "kernel_traits.h"
#include "mainloop_attn.hpp"
#include "softmax.hpp"
using namespace cute;
template <int kHeadDim>
auto get_gmem_layout(int token_num, int head_num) {
return make_layout(
make_shape(token_num, kHeadDim, head_num),
make_stride(head_num * kHeadDim, cute::_1{}, kHeadDim));
}
template <typename Ktraits>
__global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1)
compute_attn_ws(
CUTE_GRID_CONSTANT typename CollectiveMainloopAttn<Ktraits>::Params const mainloop_params,
CUTE_GRID_CONSTANT Flash_mask_params const data_params) {
using Element = typename Ktraits::Element;
using ElementAccum = typename Ktraits::ElementAccum;
using SoftType = ElementAccum;
using TileShape_MNK = typename Ktraits::TileShape_MNK;
using ClusterShape = typename Ktraits::ClusterShape_MNK;
static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{});
static constexpr int NumCopyThreads = cutlass::NumThreadsPerWarpGroup;
static constexpr int kBlockM = Ktraits::kBlockM;
static constexpr int kBlockN = Ktraits::kBlockN;
constexpr int kHeadDim = Ktraits::kHeadDim;
constexpr bool NeedMask = Ktraits::NeedMask;
using CollectiveMainloop = CollectiveMainloopAttn<Ktraits>;
using MainloopPipeline = typename Ktraits::MainloopPipeline;
using PipelineParams = typename MainloopPipeline::Params;
using PipelineState = typename MainloopPipeline::PipelineState;
extern __shared__ char shared_memory[];
auto &shared_storage = *reinterpret_cast<typename Ktraits::SharedStorage*>(shared_memory);
__align__(16) __shared__ int mask[kBlockM];
const int m_block = blockIdx.x;
const int bidh = blockIdx.y;
const int bidb = blockIdx.z;
if constexpr (NeedMask) {
const int *mask_this_batch = data_params.mask + data_params.cu_seq_q[bidb] + m_block * kBlockM;
for (int i = threadIdx.x; i < kBlockM; i += Ktraits::kNWarps * cutlass::NumThreadsPerWarp) {
mask[i] = mask_this_batch[i];
}
}
const int seq_len_q = data_params.seq_len_encoder[bidb];
const int seq_len_k = data_params.cu_seq_k[bidb + 1] - data_params.cu_seq_k[bidb];
if (m_block * kBlockM >= seq_len_q) {
return;
}
int const lane_predicate = cute::elect_one_sync();
int const warp_idx = cutlass::canonical_warp_idx_sync();
if (warp_idx == 0 && lane_predicate) {
CollectiveMainloop::prefetch_tma_descriptors(mainloop_params);
}
int const warp_group_thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup;
PipelineParams pipeline_params;
pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesK;
int warp_group_idx = cutlass::canonical_warp_group_idx();
pipeline_params.role = warp_group_idx == 0
? MainloopPipeline::ThreadCategory::Producer
: MainloopPipeline::ThreadCategory::Consumer;
pipeline_params.is_leader = warp_group_thread_idx == 0;
pipeline_params.num_consumers = NumMmaThreads;
if (warp_idx == 0 && lane_predicate) {
shared_storage.barrier_Q.init(1);
}
MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{});
MainloopPipeline pipeline_v(shared_storage.pipeline_v, pipeline_params, ClusterShape{});
__syncthreads();
CollectiveMainloop collective_mainloop;
const int real_seq = seq_len_q - m_block * kBlockM;
const int n_block_max = NeedMask ? cute::ceil_div(mask[min(kBlockM - 1, real_seq - 1)], kBlockN) : cute::ceil_div((m_block + 1) * kBlockM + seq_len_k - seq_len_q, kBlockN);
if (warp_group_idx == 0) { // Producer
cutlass::arch::warpgroup_reg_dealloc<Ktraits::kNWarps == 8 ? 56 : 24>();
int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
if (warp_idx_in_warpgroup == 0) { // Load Q, K, V
PipelineState smem_pipe_write_k = cutlass::make_producer_start_state<MainloopPipeline>();
PipelineState smem_pipe_write_v = cutlass::make_producer_start_state<MainloopPipeline>();
collective_mainloop.load(
mainloop_params,
pipeline_k,
pipeline_v,
smem_pipe_write_k,
smem_pipe_write_v,
shared_storage,
n_block_max,
m_block,
bidh,
bidb,
data_params.cu_seq_q,
data_params.cu_seq_k,
seq_len_q,
seq_len_k);
}
} else { // Consumer
cutlass::arch::warpgroup_reg_alloc<Ktraits::kNWarps == 8 ? 256 : 240>();
typename Ktraits::TiledMma1 tiled_mma1;
PipelineState smem_pipe_read_k, smem_pipe_read_v;
Tensor tOrO = partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{}));
Softmax<2 * (2 * kBlockM / NumMmaThreads)> softmax;
collective_mainloop.mma(
mainloop_params,
pipeline_k,
pipeline_v,
smem_pipe_read_k,
smem_pipe_read_v,
tOrO,
softmax,
mask,
n_block_max,
threadIdx.x - NumCopyThreads,
m_block,
seq_len_q,
seq_len_k,
shared_storage);
const int o_head_stride = data_params.head_num * kHeadDim;
const int store_offset = (data_params.cu_seq_q[bidb] + m_block * kBlockM) * o_head_stride + bidh * kHeadDim;
collective_mainloop.store<NumMmaThreads>(
mainloop_params,
tOrO,
shared_storage,
tiled_mma1,
threadIdx.x - NumCopyThreads,
o_head_stride,
real_seq,
reinterpret_cast<Element*>(data_params.o_ptr) + store_offset);
}
}
template<typename Kernel_traits>
void run_flash_mask(Flash_mask_params &params, cudaStream_t stream) {
using Element = typename Kernel_traits::Element;
using TileShape_MNK = typename Kernel_traits::TileShape_MNK;
using ClusterShape = typename Kernel_traits::ClusterShape_MNK;
using CollectiveMainloop = CollectiveMainloopAttn<Kernel_traits>;
constexpr int kHeadDim = Kernel_traits::kHeadDim;
typename CollectiveMainloop::Params mainloop_params =
CollectiveMainloop::to_underlying_arguments({
static_cast<Element const*>(params.q_ptr),
get_gmem_layout<kHeadDim>(params.max_seq_len_q, params.head_num),
static_cast<Element const*>(params.k_ptr),
get_gmem_layout<kHeadDim>(params.max_seq_len_k, params.kv_head_num),
static_cast<Element const*>(params.v_ptr),
get_gmem_layout<kHeadDim>(params.max_seq_len_k, params.kv_head_num),
params.scale_softmax_log2
});
int num_blocks_m = cutlass::ceil_div(params.max_seq_len_q, Kernel_traits::kBlockM);
num_blocks_m = cutlass::ceil_div(num_blocks_m, size<0>(ClusterShape{})) * size<0>(ClusterShape{});
void *kernel;
kernel = (void *)compute_attn_ws<Kernel_traits>;
int smem_size = sizeof(typename Kernel_traits::SharedStorage);
if (smem_size >= 48 * 1024) {
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
}
dim3 grid_dims;
grid_dims.x = num_blocks_m;
grid_dims.y = params.head_num;
grid_dims.z = params.batch_size;
static constexpr int ctaSize = Kernel_traits::kNWarps * 32;
dim3 block_dims(ctaSize);
dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{}));
cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream};
cutlass::launch_kernel_on_cluster(launch_params, kernel, mainloop_params, params);
}
template <int kBlockM, int kBlockN, bool NeedMask, typename InputType>
void flash_attn_headdim128(Flash_mask_params &params, cudaStream_t stream) {
constexpr static int Headdim = 128;
constexpr static int kNWarps = kBlockM / 16 + 4;
constexpr static int kStages = 2;
using Ktraits = Flash_mask_kernel_traits<Headdim, kBlockM, kBlockN, kNWarps, kStages, NeedMask, InputType>;
run_flash_mask<Ktraits>(params, stream);
}

View File

@@ -0,0 +1,124 @@
/******************************************************************************
* Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
******************************************************************************/
#pragma once
#include "cute/atom/mma_atom.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/cutlass.h"
#include "cutlass/layout/layout.h"
#include "cutlass/numeric_types.h"
#include "cutlass/pipeline/pipeline.hpp"
using namespace cute;
struct Flash_mask_params {
void *__restrict__ q_ptr;
void *__restrict__ k_ptr;
void *__restrict__ v_ptr;
void * __restrict__ o_ptr;
int * __restrict__ cu_seq_q;
int * __restrict__ cu_seq_k;
int * __restrict__ mask;
int * seq_len_encoder;
int head_num;
int kv_head_num;
int max_seq_len_q;
int max_seq_len_k;
int batch_size;
int gqa_group_size;
float scale_softmax_log2;
};
template <int kStages, class Gemm1Type, class Gemm2Type, class OutputType, class SmemLayoutQ,
class SmemLayoutK, class SmemLayoutV, class SmemLayoutO>
struct SharedStorageQKVO {
cute::array_aligned<Gemm1Type, cute::cosize_v<SmemLayoutQ>> smem_q;
cute::array_aligned<Gemm1Type, cute::cosize_v<SmemLayoutK>> smem_k;
union {
cute::array_aligned<Gemm2Type, cute::cosize_v<SmemLayoutV>> smem_v;
cute::array_aligned<OutputType, cute::cosize_v<SmemLayoutO>> smem_o;
};
struct {
cutlass::arch::ClusterTransactionBarrier barrier_Q;
typename cutlass::PipelineTmaAsync<kStages>::SharedStorage pipeline_k;
typename cutlass::PipelineTmaAsync<kStages>::SharedStorage pipeline_v;
};
};
template<int kHeadDim_, int kBlockM_, int kBlockN_, int kNWarps_, int kStages_, bool NeedMask_, typename elem_type=cutlass::half_t>
struct Flash_mask_kernel_traits {
using Element = elem_type;
using ElementAccum = float;
using index_t = int32_t;
static constexpr int kNWarps = kNWarps_;
static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp;
static constexpr int kBlockM = kBlockM_;
static constexpr int kBlockN = kBlockN_;
static constexpr int kHeadDim = kHeadDim_;
static_assert(kHeadDim % 32 == 0);
using TileShape_MNK = Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
using ClusterShape_MNK = Shape<Int<1>, Int<1>, Int<1>>;
static constexpr int kStages = kStages_;
static constexpr int NeedMask = NeedMask_;
using AtomLayoutMNK = Layout<Shape<Int<kBlockM / 64>, _1, _1>>;
using TiledMma0 = decltype(cute::make_tiled_mma(
cute::GMMA::ss_op_selector<Element, Element, ElementAccum, TileShape_MNK>(),
AtomLayoutMNK{}));
using TiledMma1 = decltype(cute::make_tiled_mma(
cute::GMMA::rs_op_selector<Element, Element, ElementAccum, decltype(select<0, 2, 1>(TileShape_MNK{})),
GMMA::Major::K, GMMA::Major::MN>(),
AtomLayoutMNK{}));
using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{})));
using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutK =
decltype(tile_to_shape(SmemLayoutAtomK{},
make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<kStages>{})));
using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutV =
decltype(tile_to_shape(SmemLayoutAtomV{},
make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<kStages>{})));
using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{})));
using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
using SmemCopyAtomO = Copy_Atom<cute::SM90_U32x4_STSM_N, Element>;
using SharedStorage = SharedStorageQKVO<kStages, Element, Element, Element, SmemLayoutQ, SmemLayoutK, SmemLayoutV, SmemLayoutO>;
static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarpGroup;
static constexpr int NumMmaThreads = kNThreads - NumProducerThreads;
static constexpr int kNumVecElem = ceil_div(128, sizeof_bits_v<Element>);
static constexpr int kNumThreadsPerRow = kHeadDim / kNumVecElem;
static_assert(NumMmaThreads % kNumThreadsPerRow == 0);
static constexpr int kNumRows = NumMmaThreads / kNumThreadsPerRow;
using TiledCopyOAtom = cute::Copy_Atom<cute::UniversalCopy<cutlass::uint128_t>, Element>;
using TiledCopyOThrLayout = decltype(cute::make_layout(
cute::make_shape(Int<kNumRows>{}, Int<kNumThreadsPerRow>{}),
LayoutRight{}));
using TiledCopyOValLayout = decltype(cute::make_layout(
cute::make_shape(_1{}, Int<kNumVecElem>{}),
LayoutRight{}));
using GmemTiledCopyO = decltype(make_tiled_copy(
TiledCopyOAtom{},
TiledCopyOThrLayout{},
TiledCopyOValLayout{}
));
using MainloopPipeline = typename cutlass::PipelineTmaAsync<kStages>;
using PipelineState = typename cutlass::PipelineState<kStages>;
};

View File

@@ -0,0 +1,431 @@
/******************************************************************************
* Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
******************************************************************************/
#pragma once
#include <cutlass/cutlass.h>
#include <cutlass/array.h>
#include <cutlass/numeric_types.h>
#include <cutlass/numeric_conversion.h>
#include "cutlass/pipeline/pipeline.hpp"
#include "cute/tensor.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "utils.hpp"
using namespace cute;
template <typename Ktraits>
struct CollectiveMainloopAttn {
using Element = typename Ktraits::Element;
using TileShape_MNK = typename Ktraits::TileShape_MNK;
using ClusterShape = typename Ktraits::ClusterShape_MNK;
static constexpr int kStages = Ktraits::kStages;
static constexpr int kHeadDim = Ktraits::kHeadDim;
static constexpr int kBlockM = Ktraits::kBlockM;
static constexpr int kBlockN = Ktraits::kBlockN;
static constexpr bool NeedMask = Ktraits::NeedMask;
using ShapeT = cute::Shape<int32_t, int32_t, int32_t>;
using StrideT = cute::Shape<int32_t, _1, int32_t>;
using LayoutT = cute::Layout<ShapeT, StrideT>;
using GmemTiledCopyQ = cute::SM90_TMA_LOAD;
using GmemTiledCopyKV = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{})));
using GmemTiledCopyO = typename Ktraits::GmemTiledCopyO;
using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{})));
using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector<GMMA::Major::K, Element,
decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
using SmemLayoutK =
decltype(tile_to_shape(SmemLayoutAtomK{},
make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<kStages>{})));
using SmemLayoutV = SmemLayoutK;
// Note this is the transpose in terms of the view, not in terms of memory.
using SmemLayoutVt =
decltype(cute::composition(SmemLayoutV{},
make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{}), Int<kStages>{}),
make_stride(get<1>(TileShape_MNK{}), _1{}, Int<size(SmemLayoutV{}(_, _, _0{}))>{}))));
using SmemLayoutO = typename Ktraits::SmemLayoutO;
using SmemCopyAtomO = typename Ktraits::SmemCopyAtomO;
using TMA_Q = decltype(make_tma_copy(
GmemTiledCopyQ{},
make_tensor(
make_gmem_ptr(static_cast<Element const*>(nullptr)),
repeat_like(StrideT{}, int32_t(0)),
StrideT{}
),
SmemLayoutQ{},
select<0, 2>(TileShape_MNK{}),
_1{})); // no mcast for Q
using TMA_KV = decltype(make_tma_copy(
GmemTiledCopyKV{},
make_tensor(
make_gmem_ptr(static_cast<Element const*>(nullptr)),
repeat_like(StrideT{}, int32_t(0)),
StrideT{}
),
take<0, 2>(SmemLayoutK{}),
select<1, 2>(TileShape_MNK{}),
size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{});
using MainloopPipeline = typename Ktraits::MainloopPipeline;
using PipelineParams = typename MainloopPipeline::Params;
using PipelineState = typename MainloopPipeline::PipelineState;
// Set the bytes transferred in this TMA transaction (may involve multiple issues)
static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(size(SmemLayoutQ{}) * cutlass::sizeof_bits_v<Element> / 8);
static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v<Element> / 8);
static constexpr bool UseSchedulerBarrier = kHeadDim <= 128;
// Host side kernel arguments
struct Arguments {
Element const* ptr_Q;
LayoutT layout_Q;
Element const* ptr_K;
LayoutT layout_K;
Element const* ptr_V;
LayoutT layout_V;
float const softmax_scale_log2;
};
// Device side kernel params
struct Params {
LayoutT layout_Q;
LayoutT layout_K;
LayoutT layout_V;
cutlass::FastDivmod qhead_per_khead_divmod;
TMA_Q tma_load_Q;
TMA_KV tma_load_K, tma_load_V;
float const softmax_scale_log2;
};
static Params
to_underlying_arguments(Arguments const& args) {
Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.layout_Q);
TMA_Q tma_load_Q = make_tma_copy(
GmemTiledCopyQ{},
mQ,
SmemLayoutQ{},
select<0, 2>(TileShape_MNK{}),
_1{});
Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.layout_K);
TMA_KV tma_load_K = make_tma_copy(
GmemTiledCopyKV{},
mK,
SmemLayoutK{}(_, _, _0{}),
select<1, 2>(TileShape_MNK{}),
size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.layout_V);
TMA_KV tma_load_V = make_tma_copy(
GmemTiledCopyKV{},
mV,
SmemLayoutV{}(_, _, _0{}),
select<1, 2>(TileShape_MNK{}),
size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
return {args.layout_Q, args.layout_K, args.layout_V,
cutlass::FastDivmod(cute::ceil_div(get<2>(args.layout_Q.shape()), get<2>(args.layout_K.shape()))),
tma_load_Q, tma_load_K, tma_load_V,
args.softmax_scale_log2};
}
/// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance
CUTLASS_DEVICE
static void prefetch_tma_descriptors(Params const& mainloop_params) {
cute::prefetch_tma_descriptor(mainloop_params.tma_load_Q.get_tma_descriptor());
cute::prefetch_tma_descriptor(mainloop_params.tma_load_K.get_tma_descriptor());
cute::prefetch_tma_descriptor(mainloop_params.tma_load_V.get_tma_descriptor());
}
template <typename MTensor, typename Shape>
CUTLASS_DEVICE auto get_local_tile_tensor(
const MTensor &m_tensor,
const Shape &tile_shape,
const int *cu_seq_len,
const int bidh,
const int bidb,
const int actual_seq_len) const {
auto g_offset = local_tile(
m_tensor(_, _, bidh),
cute::make_shape(1, get<1>(tile_shape)),
make_coord(cu_seq_len[bidb], _0{}));
auto g_sequence = make_tensor(
g_offset.data(),
make_layout(
cute::make_shape(actual_seq_len, get<1>(tile_shape)),
g_offset.stride()
));
auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_, _0{}));
return g_tensor;
}
template <typename SharedStorage>
CUTLASS_DEVICE void
load(Params const& mainloop_params,
MainloopPipeline pipeline_k,
MainloopPipeline pipeline_v,
PipelineState& smem_pipe_write_k,
PipelineState& smem_pipe_write_v,
SharedStorage &shared_storage,
const int n_block_max,
const int m_block,
const int bidh,
const int bidb,
const int *cu_seq_q,
const int *cu_seq_k,
const int seq_len_q,
const int seq_len_k) {
Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{});
Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{});
Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{});
Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.layout_Q.shape());
Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape());
Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape());
int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh);
Tensor gQ = get_local_tile_tensor(
mQ, select<0, 2>(TileShape_MNK{}), cu_seq_q, bidh, bidb, seq_len_q)(_, _, m_block);
Tensor gK = get_local_tile_tensor(
mK, select<1, 2>(TileShape_MNK{}), cu_seq_k, bidh_kv, bidb, seq_len_k);
Tensor gV = get_local_tile_tensor(
mV, select<1, 2>(TileShape_MNK{}), cu_seq_k, bidh_kv, bidb, seq_len_k);
Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{}));
Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{}));
auto [tQgQ, tQsQ] = tma_partition(mainloop_params.tma_load_Q, _0{}, Layout<_1>{},group_modes<0, 2>(sQ_x), group_modes<0, 2>(gQ_x));
auto [tKgK, tKsK] = tma_partition(mainloop_params.tma_load_K, _0{}, Layout<_1>{},group_modes<0, 2>(sK), group_modes<0, 2>(gK));
auto [tVgV, tVsV] = tma_partition(mainloop_params.tma_load_V, _0{}, Layout<_1>{},group_modes<0, 2>(sV), group_modes<0, 2>(gV));
uint16_t mcast_mask_kv = 0;
int n_block = n_block_max - 1;
int lane_predicate = cute::elect_one_sync();
if (lane_predicate) {
shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ);
copy(mainloop_params.tma_load_Q.with(reinterpret_cast<cutlass::arch::ClusterTransactionBarrier::ValueType&>(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ);
}
if (lane_predicate) {
pipeline_k.producer_acquire(smem_pipe_write_k);
copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write_k), mcast_mask_kv),
tKgK(_, n_block), tKsK(_, smem_pipe_write_k.index()));
++smem_pipe_write_k;
}
if (lane_predicate) {
#pragma unroll 2
for (; n_block > 0; --n_block) {
pipeline_k.producer_acquire(smem_pipe_write_k);
copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write_k), mcast_mask_kv),
tKgK(_, n_block - 1), tKsK(_, smem_pipe_write_k.index()));
++smem_pipe_write_k;
pipeline_v.producer_acquire(smem_pipe_write_v);
copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write_v), mcast_mask_kv),
tVgV(_, n_block), tVsV(_, smem_pipe_write_v.index()));
++smem_pipe_write_v;
}
}
if (lane_predicate) {
pipeline_v.producer_acquire(smem_pipe_write_v);
copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write_v), mcast_mask_kv),
tVgV(_, n_block), tVsV(_, smem_pipe_write_v.index()));
++smem_pipe_write_v;
}
}
template <typename SharedStorage, typename FrgTensorO, typename Softmax>
CUTLASS_DEVICE void
mma(Params const& mainloop_params,
MainloopPipeline pipeline_k,
MainloopPipeline pipeline_v,
PipelineState& smem_pipe_read_k,
PipelineState& smem_pipe_read_v,
FrgTensorO& tOrO,
Softmax& softmax,
const int *mask,
const int n_block_max,
const int thread_idx,
const int m_block,
const int seq_len_q,
const int seq_len_k,
SharedStorage& shared_storage) {
Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{});
Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{});
Tensor sVt = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutVt{});
typename Ktraits::TiledMma0 tiled_mma0;
typename Ktraits::TiledMma1 tiled_mma1;
auto threadMma0 = tiled_mma0.get_thread_slice(thread_idx);
auto threadMma1 = tiled_mma1.get_thread_slice(thread_idx);
Tensor tSrQ = threadMma0.partition_fragment_A(sQ);
Tensor tSrK = threadMma0.partition_fragment_B(sK);
Tensor tOrV = threadMma1.partition_fragment_B(sVt);
auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
pipeline.consumer_wait(smem_pipe_read, barrier_token);
};
tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero;
int n_block = n_block_max - 1;
cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(shared_storage.barrier_Q.try_wait(0));
if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(0); }
Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
consumer_wait(pipeline_k, smem_pipe_read_k);
gemm</*zero_init=*/true, /*wg_wait=*/-1>(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS);
warpgroup_wait<0>();
pipeline_k.consumer_release(smem_pipe_read_k);
++smem_pipe_read_k;
int mask_start_idx;
int mask_row_id;
int col_base;
if constexpr (NeedMask) {
const int lane_id = thread_idx % 32;
mask_start_idx = mask[0] / kBlockN - 1;
mask_row_id = thread_idx / 32 * 16 + lane_id / 4;
col_base = thread_idx % 4 * 2;
app_mask(
tSrS,
mask,
mask_row_id,
col_base + n_block * kBlockN);
} else {
auto col_limit_causal = [&](int row, int n_block) {
return row + 1 + seq_len_k - n_block * kBlockN - seq_len_q + m_block * kBlockM;
};
Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{}));
Tensor tScS = threadMma0.partition_C(cS);
#pragma unroll
for (int i = 0; i < size(tSrS); ++i) {
if (int(get<1>(tScS(i))) >=
std::min(seq_len_k - n_block * kBlockN, col_limit_causal(int(get<0>(tScS(i))), n_block))) {
tSrS(i) = -INFINITY;
}
}
}
softmax.template online_softmax</*Is_first=*/true>(tSrS, mainloop_params.softmax_scale_log2);
Tensor tOrP = make_tensor(convert_type<Element>(tSrS).data(), convert_layout_acc_Aregs<typename Ktraits::TiledMma1>(tSrS.layout()));
Tensor scores_scale = make_fragment_like(softmax.row_max);
clear(scores_scale);
#pragma unroll 1
for (; n_block > 0; --n_block) {
Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
consumer_wait(pipeline_k, smem_pipe_read_k);
if constexpr (NeedMask) {
if (n_block >= mask_start_idx) {
app_mask(
tSrS,
mask,
mask_row_id,
col_base + n_block * kBlockN);
}
}
gemm</*zero_init=*/true, /*wg_wait=*/-1>(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS);
softmax.rescale_o(tOrO, scores_scale);
consumer_wait(pipeline_v, smem_pipe_read_v);
gemm</*zero_init=*/false, /*wg_wait=*/-1>(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO);
warpgroup_wait<1>();
pipeline_k.consumer_release(smem_pipe_read_k); // release K
cute::copy(softmax.template max</*Is_first=*/false>(tSrS, mainloop_params.softmax_scale_log2), scores_scale);
softmax.template online_softmax</*Is_first=*/false>(tSrS, mainloop_params.softmax_scale_log2);
warpgroup_wait<0>();
pipeline_v.consumer_release(smem_pipe_read_v); // release V
++smem_pipe_read_k;
++smem_pipe_read_v;
cute::copy(make_tensor(convert_type<Element>(tSrS).data(), convert_layout_acc_Aregs<typename Ktraits::TiledMma1>(tSrS.layout())), tOrP);
}
softmax.rescale_o(tOrO, scores_scale);
consumer_wait(pipeline_v, smem_pipe_read_v);
gemm</*zero_init=*/false, /*wg_wait=*/-1>(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO);
cute::copy(softmax.finalize(mainloop_params.softmax_scale_log2), scores_scale);
warpgroup_wait<0>();
pipeline_v.consumer_release(smem_pipe_read_v);
++smem_pipe_read_v;
softmax.rescale_o(tOrO, scores_scale);
return;
}
template <int NumMmaThreads, typename SharedStorage, typename FrgTensorO, typename TiledMma, typename T>
CUTLASS_DEVICE void
store(Params const& mainloop_params,
FrgTensorO const& tOrO,
SharedStorage& shared_storage,
TiledMma tiled_mma,
int thread_idx,
const int o_head_stride,
const int real_seq,
T * out_ptr) {
Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{});
auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma);
auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx);
Tensor tOrO_out = convert_type<Element>(tOrO);
Tensor taccOrO = smem_thr_copy_O.retile_S(tOrO_out);
Tensor taccOsO = smem_thr_copy_O.partition_D(sO);
cute::copy(smem_tiled_copy_O, taccOrO, taccOsO);
cutlass::arch::NamedBarrier::sync(NumMmaThreads, 0);
Tensor gO = make_tensor(make_gmem_ptr(out_ptr),
Shape<Int<kBlockM>, Int<kHeadDim>>{},
make_stride(o_head_stride, _1{}));
GmemTiledCopyO gmem_tiled_copy_O;
auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
Tensor tOsO = gmem_thr_copy_O.partition_S(sO);
Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
Tensor cO = make_identity_tensor(Shape<Int<kBlockM>, Int<kHeadDim>>{});
Tensor tOcO = gmem_thr_copy_O.partition_S(cO);
if (real_seq >= kBlockM) {
copy<true>(gmem_tiled_copy_O, tOsO, tOgO, tOcO);
} else {
copy<false>(gmem_tiled_copy_O, tOsO, tOgO, tOcO, real_seq);
}
}
};

Some files were not shown because too many files have changed in this diff Show More