diff --git a/.github/workflows/ci_iluvatar.yml b/.github/workflows/ci_iluvatar.yml index 8c15eca89..6d64c2e9c 100644 --- a/.github/workflows/ci_iluvatar.yml +++ b/.github/workflows/ci_iluvatar.yml @@ -11,8 +11,7 @@ concurrency: jobs: CI_ILUVATAR: - runs-on: - group: IXUCA + runs-on: [self-hosted, ILUVATAR_8Card] steps: - name: Print current runner name run: | @@ -23,7 +22,7 @@ jobs: - name: Code Checkout env: - docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest + docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:paddle-ocr-vl-1107 run: | REPO="https://github.com/${{ github.repository }}.git" FULL_REPO="${{ github.repository }}" @@ -56,7 +55,7 @@ jobs: - name: Run CI unittest env: - docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest + docker_image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:paddle-ocr-vl-1107 run: | runner_name="${{ runner.name }}" last_char="${runner_name: -1}" diff --git a/custom_ops/gpu_ops/get_padding_offset.cu b/custom_ops/gpu_ops/get_padding_offset.cu index 60591d246..db85e343a 100644 --- a/custom_ops/gpu_ops/get_padding_offset.cu +++ b/custom_ops/gpu_ops/get_padding_offset.cu @@ -28,8 +28,13 @@ __global__ void PrefixSumKernel(int64_t *ids_remove_padding, const int max_seq_len) { const int bi = blockIdx.x; const int tid = threadIdx.x; +#ifdef PADDLE_WITH_COREX + const int warp_id = threadIdx.x / 64; + const int lane_id = threadIdx.x % 64; +#else const int warp_id = threadIdx.x / 32; const int lane_id = threadIdx.x % 32; +#endif int cum_seq_len = 0; diff --git a/custom_ops/iluvatar_ops/mixed_fused_attn.cu b/custom_ops/iluvatar_ops/mixed_fused_attn.cu index b5388cc2c..9224e22f8 100644 --- a/custom_ops/iluvatar_ops/mixed_fused_attn.cu +++ b/custom_ops/iluvatar_ops/mixed_fused_attn.cu @@ -16,32 +16,37 @@ #include "iluvatar_context.h" template -void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv, - paddle::Tensor& k_cache, - paddle::Tensor& v_cache, - 
const paddle::Tensor& prefill_block_table, - const paddle::Tensor& decode_block_table, - const paddle::Tensor& cu_seqlens_qkv, - const paddle::Tensor& seq_lens, - const paddle::optional& rope_sin, - const paddle::optional& rope_cos, - int prefill_num_tokens, - int num_heads, - int head_dim, - int num_kv_heads, - int block_size, - int max_seq_len, - float scale, - bool causal, - bool q_rope, - bool k_rope, - bool v_rope, - int window_left, - int window_right, - float softcap, - bool enable_cuda_graph, - bool use_sqrt_alibi, - paddle::Tensor& out) { +void MixedFusedPagedAttnKernel( + const paddle::Tensor& qkv, + paddle::Tensor& k_cache, + paddle::Tensor& v_cache, + const paddle::Tensor& prefill_block_table, + const paddle::Tensor& decode_block_table, + const paddle::Tensor& cu_seqlens_qkv, + const paddle::Tensor& seq_lens, + const paddle::Tensor& prefill_rope_sin, + const paddle::Tensor& prefill_rope_cos, + const paddle::optional& decode_rope_sin, + const paddle::optional& decode_rope_cos, + int prefill_num_tokens, + int num_heads, + int head_dim, + int num_kv_heads, + int block_size, + int max_seq_len, + float scale, + bool causal, + bool q_rope, + bool k_rope, + bool v_rope, + int window_left, + int window_right, + float softcap, + bool enable_cuda_graph, + bool use_sqrt_alibi, + int rope_batch_stride, + bool is_interleaved_rope_mode, + paddle::Tensor& out) { typedef PDTraits traits_; typedef typename traits_::data_t data_t; @@ -72,8 +77,39 @@ void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv, int kv_block_stride = k_cache.strides()[0]; int kv_head_stride = k_cache.strides()[1]; int block_table_stride = prefill_block_table.strides()[0]; - const float* rope_sin_ptr = rope_sin ? rope_sin.get().data() : nullptr; - const float* rope_cos_ptr = rope_cos ? 
rope_cos.get().data() : nullptr; + const float* prefill_rope_sin_ptr = prefill_rope_sin.data(); + const float* prefill_rope_cos_ptr = prefill_rope_cos.data(); + const auto& prefill_rope_dims = prefill_rope_sin.dims(); + std::vector prefill_rope_shape_vec, prefill_rope_stride_vec; + int prefill_rope_ndim; + if (prefill_rope_dims.size() == 4) { + // [prefill_batch_size, max_seq_len, 1, head_dim] + PADDLE_ENFORCE_EQ( + prefill_rope_dims[0], + prefill_batch_size, + common::errors::InvalidArgument( + "prefill_rope_dims[0] must be equal to prefill_batch_size")); + prefill_rope_shape_vec = + std::vector({prefill_batch_size, max_seq_len, head_dim}); + prefill_rope_stride_vec = + std::vector({max_seq_len * head_dim, head_dim, 1}); + prefill_rope_ndim = 3; + } else if (prefill_rope_dims.size() == 3) { + // [max_seq_len, 1, head_dim] + prefill_rope_shape_vec = std::vector({max_seq_len, head_dim}); + prefill_rope_stride_vec = std::vector({head_dim, 1}); + prefill_rope_ndim = 2; + } else { + PD_THROW("Unsupported prefill_rope_ndim = %d for Paged attn", + prefill_rope_ndim); + } + + const float* decode_rope_sin_ptr = + decode_rope_sin ? decode_rope_sin.get().data() : nullptr; + const float* decode_rope_cos_ptr = + decode_rope_cos ? decode_rope_cos.get().data() : nullptr; + cuinferAttentionRopeMode_t rope_mode = + is_interleaved_rope_mode ? 
CUINFER_ATTEN_NORMAL : CUINFER_ATTEN_OCRV1; cuinferTensorDescriptor_t qkv_desc; CUINFER_CHECK(cuinferCreateTensorDescriptor(&qkv_desc)); @@ -139,21 +175,19 @@ void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv, cuinferTensorDescriptor_t cos_desc; CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc)); - CUINFER_CHECK(cuinferSetTensorNdDescriptor( - cos_desc, - CUINFER_DATA_FLOAT, - 2, - std::vector({max_seq_len, head_dim}).data(), - std::vector({head_dim, 1}).data())); + CUINFER_CHECK(cuinferSetTensorNdDescriptor(cos_desc, + CUINFER_DATA_FLOAT, + prefill_rope_ndim, + prefill_rope_shape_vec.data(), + prefill_rope_stride_vec.data())); cuinferTensorDescriptor_t sin_desc; CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc)); - CUINFER_CHECK(cuinferSetTensorNdDescriptor( - sin_desc, - CUINFER_DATA_FLOAT, - 2, - std::vector({max_seq_len, head_dim}).data(), - std::vector({head_dim, 1}).data())); + CUINFER_CHECK(cuinferSetTensorNdDescriptor(sin_desc, + CUINFER_DATA_FLOAT, + prefill_rope_ndim, + prefill_rope_shape_vec.data(), + prefill_rope_stride_vec.data())); cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle(); @@ -195,9 +229,9 @@ void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv, prefill_workspace_ptr, prefill_workspace_size, cos_desc, - rope_cos_ptr, + prefill_rope_cos_ptr, sin_desc, - rope_sin_ptr, + prefill_rope_sin_ptr, prefill_batch_size, num_heads, num_kv_heads, @@ -206,7 +240,8 @@ void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv, scale, q_rope, k_rope, - v_rope)); + v_rope, + rope_mode)); size_t decode_workspace_size = 0; CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(decode_num_tokens, @@ -241,8 +276,18 @@ void MixedFusedPagedAttnKernel(const paddle::Tensor& qkv, decode_qkv_ptr, decode_workspace_ptr, true, - rope_sin_ptr, - rope_cos_ptr}; + decode_rope_sin_ptr, + decode_rope_cos_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + 1, + 0, + 0, + nullptr, + static_cast(rope_batch_stride), + rope_mode}; 
CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle, decode_out_ptr, @@ -285,8 +330,10 @@ std::vector MixedFusedPagedAttn( const paddle::Tensor& decode_block_table, const paddle::Tensor& cu_seqlens_qkv, const paddle::Tensor& seq_lens, - const paddle::optional& rope_sin, - const paddle::optional& rope_cos, + const paddle::Tensor& prefill_rope_sin, + const paddle::Tensor& prefill_rope_cos, + const paddle::optional& decode_rope_sin, + const paddle::optional& decode_rope_cos, int prefill_num_tokens, int num_heads, int head_dim, @@ -302,67 +349,79 @@ std::vector MixedFusedPagedAttn( int window_right, float softcap, bool enable_cuda_graph, - bool use_sqrt_alibi) { + bool use_sqrt_alibi, + int rope_batch_stride, + bool is_interleaved_rope_mode) { const auto dtype = qkv.dtype(); auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place()); switch (dtype) { case paddle::DataType::BFLOAT16: - MixedFusedPagedAttnKernel(qkv, - k_cache, - v_cache, - prefill_block_table, - decode_block_table, - cu_seqlens_qkv, - seq_lens, - rope_sin, - rope_cos, - prefill_num_tokens, - num_heads, - head_dim, - num_kv_heads, - block_size, - max_seq_len, - scale, - causal, - q_rope, - k_rope, - v_rope, - window_left, - window_right, - softcap, - enable_cuda_graph, - use_sqrt_alibi, - out); + MixedFusedPagedAttnKernel( + qkv, + k_cache, + v_cache, + prefill_block_table, + decode_block_table, + cu_seqlens_qkv, + seq_lens, + prefill_rope_sin, + prefill_rope_cos, + decode_rope_sin, + decode_rope_cos, + prefill_num_tokens, + num_heads, + head_dim, + num_kv_heads, + block_size, + max_seq_len, + scale, + causal, + q_rope, + k_rope, + v_rope, + window_left, + window_right, + softcap, + enable_cuda_graph, + use_sqrt_alibi, + rope_batch_stride, + is_interleaved_rope_mode, + out); break; case paddle::DataType::FLOAT16: - MixedFusedPagedAttnKernel(qkv, - k_cache, - v_cache, - prefill_block_table, - decode_block_table, - cu_seqlens_qkv, - seq_lens, - rope_sin, - rope_cos, - 
prefill_num_tokens, - num_heads, - head_dim, - num_kv_heads, - block_size, - max_seq_len, - scale, - causal, - q_rope, - k_rope, - v_rope, - window_left, - window_right, - softcap, - enable_cuda_graph, - use_sqrt_alibi, - out); + MixedFusedPagedAttnKernel( + qkv, + k_cache, + v_cache, + prefill_block_table, + decode_block_table, + cu_seqlens_qkv, + seq_lens, + prefill_rope_sin, + prefill_rope_cos, + decode_rope_sin, + decode_rope_cos, + prefill_num_tokens, + num_heads, + head_dim, + num_kv_heads, + block_size, + max_seq_len, + scale, + causal, + q_rope, + k_rope, + v_rope, + window_left, + window_right, + softcap, + enable_cuda_graph, + use_sqrt_alibi, + rope_batch_stride, + is_interleaved_rope_mode, + out); break; default: PD_THROW("Unsupported data type for mixed paged attn"); @@ -388,8 +447,10 @@ PD_BUILD_STATIC_OP(mixed_fused_paged_attn) "decode_block_table", "cu_seqlens_qkv", "seq_lens", - paddle::Optional("rope_sin"), - paddle::Optional("rope_cos")}) + "prefill_rope_sin", + "prefill_rope_cos", + paddle::Optional("decode_rope_sin"), + paddle::Optional("decode_rope_cos")}) .Outputs({"out"}) .Attrs({"prefill_num_tokens:int", "num_heads: int", @@ -406,7 +467,9 @@ PD_BUILD_STATIC_OP(mixed_fused_paged_attn) "window_right:int", "softcap:float", "enable_cuda_graph:bool", - "use_sqrt_alibi:bool"}) + "use_sqrt_alibi:bool", + "rope_batch_stride:int", + "is_interleaved_rope_mode:bool"}) .SetKernelFn(PD_KERNEL(MixedFusedPagedAttn)) .SetInferShapeFn(PD_INFER_SHAPE(MixedFusedPagedAttnInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(MixedFusedPagedAttnInferDtype)); diff --git a/custom_ops/iluvatar_ops/paged_attn.cu b/custom_ops/iluvatar_ops/paged_attn.cu index 9d2c19a17..e04731b0a 100644 --- a/custom_ops/iluvatar_ops/paged_attn.cu +++ b/custom_ops/iluvatar_ops/paged_attn.cu @@ -39,6 +39,8 @@ void PagedAttnKernel(const paddle::Tensor& q, bool enable_cuda_graph, bool use_sqrt_alibi, bool merged_qkv, + int rope_batch_stride, + bool is_interleaved_rope_mode, paddle::Tensor& out) { 
if (alibi_slopes) { PADDLE_ENFORCE_EQ(alibi_slopes.get().dtype(), @@ -186,6 +188,9 @@ void PagedAttnKernel(const paddle::Tensor& q, allocator->Allocate(workspace_size); void* workspace_ptr = tmp_workspace->ptr(); + cuinferAttentionRopeMode_t rope_mode = + is_interleaved_rope_mode ? CUINFER_ATTEN_NORMAL : CUINFER_ATTEN_OCRV1; + PageAttentionWithKVCacheArguments args{static_cast(scale), 1.0, 1.0, @@ -202,7 +207,17 @@ void PagedAttnKernel(const paddle::Tensor& q, workspace_ptr, merged_qkv, rope_sin_ptr, - rope_cos_ptr}; + rope_cos_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + 1, + 0, + 0, + nullptr, + static_cast(rope_batch_stride), + rope_mode}; CUINFER_CHECK(cuInferPageAttentionV7(cuinfer_handle, out.data(), data_type, @@ -250,7 +265,9 @@ std::vector PagedAttn( float softcap, bool enable_cuda_graph, bool use_sqrt_alibi, - bool merged_qkv) { + bool merged_qkv, + int rope_batch_stride, + bool is_interleaved_rope_mode) { const auto dtype = q.dtype(); auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place()); @@ -280,6 +297,8 @@ std::vector PagedAttn( enable_cuda_graph, use_sqrt_alibi, merged_qkv, + rope_batch_stride, + is_interleaved_rope_mode, out); break; case paddle::DataType::FLOAT16: @@ -306,6 +325,8 @@ std::vector PagedAttn( enable_cuda_graph, use_sqrt_alibi, merged_qkv, + rope_batch_stride, + is_interleaved_rope_mode, out); break; default: @@ -374,7 +395,9 @@ PD_BUILD_STATIC_OP(paged_attn) "softcap:float", "enable_cuda_graph:bool", "use_sqrt_alibi:bool", - "merged_qkv:bool"}) + "merged_qkv:bool", + "rope_batch_stride:int", + "is_interleaved_rope_mode:bool"}) .SetKernelFn(PD_KERNEL(PagedAttn)) .SetInferShapeFn(PD_INFER_SHAPE(PagedAttnInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(PagedAttnInferDtype)); diff --git a/custom_ops/iluvatar_ops/prefill_fused_attn.cu b/custom_ops/iluvatar_ops/prefill_fused_attn.cu index fe8449c40..1578eb1c5 100644 --- a/custom_ops/iluvatar_ops/prefill_fused_attn.cu +++ 
b/custom_ops/iluvatar_ops/prefill_fused_attn.cu @@ -16,25 +16,25 @@ #include "iluvatar_context.h" template -void PrefillFusedPagedAttnKernel( - const paddle::Tensor& qkv, - paddle::Tensor& k_cache, - paddle::Tensor& v_cache, - const paddle::Tensor& block_table, - const paddle::Tensor& cu_seqlens_qkv, - const paddle::optional& rope_sin, - const paddle::optional& rope_cos, - int num_heads, - int head_dim, - int num_kv_heads, - int block_size, - int max_seq_len, - float scale, - bool causal, - bool q_rope, - bool k_rope, - bool v_rope, - paddle::Tensor& out) { +void PrefillFusedPagedAttnKernel(const paddle::Tensor& qkv, + paddle::Tensor& k_cache, + paddle::Tensor& v_cache, + const paddle::Tensor& block_table, + const paddle::Tensor& cu_seqlens_qkv, + const paddle::Tensor& rope_sin, + const paddle::Tensor& rope_cos, + int num_heads, + int head_dim, + int num_kv_heads, + int block_size, + int max_seq_len, + float scale, + bool causal, + bool q_rope, + bool k_rope, + bool v_rope, + bool is_interleaved_rope_mode, + paddle::Tensor& out) { // check dtype and contiguous const auto& dtype = qkv.dtype(); cuinferDataType_t data_type; @@ -139,8 +139,28 @@ void PrefillFusedPagedAttnKernel( "cu_seqlens_qkv_dims[0] must be equal to batch_size + 1")); int block_table_stride = block_table.strides()[0]; - const float* rope_sin_ptr = rope_sin ? rope_sin.get().data() : nullptr; - const float* rope_cos_ptr = rope_cos ? 
rope_cos.get().data() : nullptr; + const float* rope_sin_ptr = rope_sin.data(); + const float* rope_cos_ptr = rope_cos.data(); + const auto& rope_dims = rope_sin.dims(); + std::vector rope_shape_vec, rope_stride_vec; + int rope_ndim; + if (rope_dims.size() == 4) { + // [batch_size, max_seq_len, 1, head_dim] + PADDLE_ENFORCE_EQ(rope_dims[0], + batch_size, + common::errors::InvalidArgument( + "rope_dims[0] must be equal to batch_size")); + rope_shape_vec = std::vector({batch_size, max_seq_len, head_dim}); + rope_stride_vec = std::vector({max_seq_len * head_dim, head_dim, 1}); + rope_ndim = 3; + } else if (rope_dims.size() == 3) { + // [max_seq_len, 1, head_dim] + rope_shape_vec = std::vector({max_seq_len, head_dim}); + rope_stride_vec = std::vector({head_dim, 1}); + rope_ndim = 2; + } else { + PD_THROW("Unsupported rope_ndim = %d for Paged attn", rope_ndim); + } cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle(); @@ -226,22 +246,22 @@ void PrefillFusedPagedAttnKernel( cuinferTensorDescriptor_t cos_desc; CUINFER_CHECK(cuinferCreateTensorDescriptor(&cos_desc)); - CUINFER_CHECK(cuinferSetTensorNdDescriptor( - cos_desc, - CUINFER_DATA_FLOAT, - 2, - std::vector({max_seq_len, head_dim}).data(), - std::vector({head_dim, 1}).data())); + CUINFER_CHECK(cuinferSetTensorNdDescriptor(cos_desc, + CUINFER_DATA_FLOAT, + rope_ndim, + rope_shape_vec.data(), + rope_stride_vec.data())); cuinferTensorDescriptor_t sin_desc; CUINFER_CHECK(cuinferCreateTensorDescriptor(&sin_desc)); - CUINFER_CHECK(cuinferSetTensorNdDescriptor( - sin_desc, - CUINFER_DATA_FLOAT, - 2, - std::vector({max_seq_len, head_dim}).data(), - std::vector({head_dim, 1}).data())); + CUINFER_CHECK(cuinferSetTensorNdDescriptor(sin_desc, + CUINFER_DATA_FLOAT, + rope_ndim, + rope_shape_vec.data(), + rope_stride_vec.data())); + cuinferAttentionRopeMode_t rope_mode = + is_interleaved_rope_mode ? 
CUINFER_ATTEN_NORMAL : CUINFER_ATTEN_OCRV1; CUINFER_CHECK(cuinferFmhaFwdMergedFuseRopeFunc(cuinfer_handle, qkv_desc, qkv.data(), @@ -269,7 +289,8 @@ void PrefillFusedPagedAttnKernel( scale, q_rope, k_rope, - v_rope)); + v_rope, + rope_mode)); CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_desc)); CUINFER_CHECK(cuinferDestroyTensorDescriptor(qkv_seqlens_desc)); @@ -287,8 +308,8 @@ std::vector PrefillFusedPagedAttn( paddle::Tensor& v_cache, const paddle::Tensor& block_table, const paddle::Tensor& cu_seqlens_qkv, - const paddle::optional& rope_sin, - const paddle::optional& rope_cos, + const paddle::Tensor& rope_sin, + const paddle::Tensor& rope_cos, int num_heads, int head_dim, int num_kv_heads, @@ -298,51 +319,56 @@ std::vector PrefillFusedPagedAttn( bool causal, bool q_rope, bool k_rope, - bool v_rope) { + bool v_rope, + bool is_interleaved_rope_mode) { const auto dtype = qkv.dtype(); auto out = paddle::empty({qkv.shape()[0], num_heads * head_dim}, dtype, qkv.place()); switch (dtype) { case paddle::DataType::BFLOAT16: - PrefillFusedPagedAttnKernel(qkv, - k_cache, - v_cache, - block_table, - cu_seqlens_qkv, - rope_sin, - rope_cos, - num_heads, - head_dim, - num_kv_heads, - block_size, - max_seq_len, - scale, - causal, - q_rope, - k_rope, - v_rope, - out); + PrefillFusedPagedAttnKernel( + qkv, + k_cache, + v_cache, + block_table, + cu_seqlens_qkv, + rope_sin, + rope_cos, + num_heads, + head_dim, + num_kv_heads, + block_size, + max_seq_len, + scale, + causal, + q_rope, + k_rope, + v_rope, + is_interleaved_rope_mode, + out); break; case paddle::DataType::FLOAT16: - PrefillFusedPagedAttnKernel(qkv, - k_cache, - v_cache, - block_table, - cu_seqlens_qkv, - rope_sin, - rope_cos, - num_heads, - head_dim, - num_kv_heads, - block_size, - max_seq_len, - scale, - causal, - q_rope, - k_rope, - v_rope, - out); + PrefillFusedPagedAttnKernel( + qkv, + k_cache, + v_cache, + block_table, + cu_seqlens_qkv, + rope_sin, + rope_cos, + num_heads, + head_dim, + num_kv_heads, + 
block_size, + max_seq_len, + scale, + causal, + q_rope, + k_rope, + v_rope, + is_interleaved_rope_mode, + out); break; default: PD_THROW("Unsupported data type for Paged attn"); @@ -382,8 +408,8 @@ PD_BUILD_STATIC_OP(prefill_fused_paged_attn) "v_cache", "block_table", "cu_seqlens_qkv", - paddle::Optional("rope_sin"), - paddle::Optional("rope_cos")}) + "rope_sin", + "rope_cos"}) .Outputs({"out"}) .Attrs({"num_heads:int", "head_dim:int", @@ -394,7 +420,8 @@ PD_BUILD_STATIC_OP(prefill_fused_paged_attn) "causal:bool", "q_rope:bool", "k_rope:bool", - "v_rope:bool"}) + "v_rope:bool", + "is_interleaved_rope_mode:bool"}) .SetKernelFn(PD_KERNEL(PrefillFusedPagedAttn)) .SetInferShapeFn(PD_INFER_SHAPE(PrefillFusedPagedAttnInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(PrefillFusedPagedAttnInferDtype)); diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index 1e5136985..16634a7d5 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -555,6 +555,9 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"): "gpu_ops/set_data_ipc.cu", "gpu_ops/limit_thinking_content_length_v1.cu", "gpu_ops/limit_thinking_content_length_v2.cu", + "gpu_ops/recover_decode_task.cu", + "gpu_ops/update_inputs_v1.cu", + "gpu_ops/get_img_boundaries.cc", "iluvatar_ops/moe_dispatch.cu", "iluvatar_ops/moe_reduce.cu", "iluvatar_ops/paged_attn.cu", diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index e5720015c..703d80040 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -1,67 +1,62 @@ [简体中文](../../zh/get_started/installation/iluvatar_gpu.md) -# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine - -## Machine Preparation -First, the `TP=16` when running the ERNIE4.5 300B model and so you need to prepare a machine with the following configurations: +## 1. 
Machine Preparation | CPU | Memory | Card | Hard Disk| | :---: | :---: | :---: | :---: | | x86 | 1TB| 16xBI150| 1TB| -Currently, the entire model needs to be loaded into the host memory, which requires more than 600GB of host memory. This issue will be optimized in subsequent versions. - -## Image Preparation +## 2. Image Preparation Pull the Docker image ```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:paddle-ocr-vl-1107 ``` -## Container Preparation -### Start Container +## 3. Container Preparation +### 3.1 Start Container ```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:paddle-ocr-vl-1107 docker exec -it paddle_infer bash ``` /home/paddle contains the model files, *.whl packages, and scripts. -### Install paddle +### 3.2 Install paddle ```bash -pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ -pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ +pip3 install paddlepaddle==3.3.0.dev20251103 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ +pip3 install paddle-iluvatar-gpu==3.0.0.dev20251107 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ ``` -For latest paddle version on iluvatar. 
Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) -### Install or build FastDeploy +### 3.3 Install or build FastDeploy ```bash -pip3 install fastdeploy_iluvatar_gpu==2.1.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels +pip3 install fastdeploy_iluvatar_gpu==2.4.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.aliyun.com/pypi/simple/ ``` You can build FastDeploy from source if you need the ```latest version```. ```bash git clone https://github.com/PaddlePaddle/FastDeploy cd FastDeploy -pip install -r requirements_iluvatar.txt +ln -sf /usr/local/bin/python3 /usr/local/bin/python +pip3 install -r requirements_iluvatar.txt export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 bash build.sh ``` -## Prepare the inference demo script +## 4. Test models on iluvatar machine +### 4.1 ERNIE-4.5 series +#### 4.1.1 ERNIE-4.5-21B-A3B-Paddle -script list below: - -`run_demo.sh`: +**offline demo** +script list bellow: ```bash #!/bin/bash export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 run_demo.py ``` @@ -79,11 +74,10 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) # load the model -llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, block_size=16, quantization='wint8') +llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8') # Perform batch inference outputs = llm.generate(prompts, sampling_params) -# Note:Replace `/home/paddle/ernie-4_5-21b-a3b-bf16-paddle` in it with the path to the ERNIE model you have downloaded. 
for output in outputs: prompt = output.prompt @@ -91,13 +85,7 @@ for output in outputs: print(prompt, generated_text) ``` -## run demo - -```bash -./run_demo.sh -``` - -The following logs will be printed: Loading the model took approximately 74 seconds, and running the demo took approximately 240 seconds. +The following logs will be printed: ``` /usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md @@ -134,277 +122,147 @@ Now, let's break down each step: The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean ``` -## Run ernie4.5 300B model with the GSM8K dataset +**online demo** -1. Download GSM8K dataset - -```bash -wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl -``` - -2. 
Prepare `bench_gsm8k.py` - -```python -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """ -# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py -import argparse -import ast -import json -import re -import time -from concurrent.futures import ThreadPoolExecutor - -import numpy as np -import requests -from tqdm import tqdm - -INVALID = -9999999 - - -def call_generate(prompt, **kwargs): - """ - Generates response based on the input prompt. - - Args: - prompt (str): The input prompt text. - **kwargs: Keyword arguments, including server IP address and port number. - - Returns: - str: The response generated based on the prompt. - - """ - url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions" - headers = {"Content-Type": "application/json"} - data = { - "messages": [ - { - "role": "user", - "content": prompt, - } - ], - "temperature": 0.6, - "max_tokens": 2047, - "top_p": 0.95, - "do_sample": True, - } - - response = requests.post(url, headers=headers, data=json.dumps(data)) - out = response.json() - return out["choices"][0]["message"]["content"] - - -def get_one_example(lines, i, include_answer): - """ - Retrieves a question-answer example from the given list of text lines. - - Args: - lines (list of dict): A list of question-answer pairs. - i (int): The index of the question-answer pair to retrieve from lines. 
- include_answer (bool): Whether to include the answer in the returned string. - - Returns: - str: A formatted question-answer string in the format "Question: \nAnswer: ". - - """ - ret = "Question: " + lines[i]["question"] + "\nAnswer:" - if include_answer: - ret += " " + lines[i]["answer"] - return ret - - -def get_few_shot_examples(lines, k): - """ - Selects k examples from the given list of text lines and concatenates them into a single string. - - Args: - lines (list): A list containing text lines. - k (int): The number of examples to select. - - Returns: - str: A string composed of k examples, separated by two newline characters. - """ - ret = "" - for i in range(k): - ret += get_one_example(lines, i, True) + "\n\n" - return ret - - -def get_answer_value(answer_str): - """ - Extracts numerical values from an answer string and returns them. - - Args: - answer_str (str): The string containing the answer. - - Returns: - The extracted numerical value; returns "INVALID" if extraction fails. - """ - answer_str = answer_str.replace(",", "") - numbers = re.findall(r"\d+", answer_str) - if len(numbers) < 1: - return INVALID - try: - return ast.literal_eval(numbers[-1]) - except SyntaxError: - return INVALID - - -def read_jsonl(filename: str): - """ - Reads a JSONL file. - - Args: - filename (str): Path to the JSONL file. - - Yields: - dict: A dictionary object corresponding to each line in the JSONL file. - """ - with open(filename) as fin: - for line in fin: - if line.startswith("#"): - continue - yield json.loads(line) - - -def main(args): - """ - Process inputs and generate answers by calling the model in parallel using a thread pool. - - Args: - args (argparse.Namespace): - - num_questions (int): Number of questions to process. - - num_shots (int): Number of few-shot learning examples. - - ip (str): IP address of the model service. - - port (int): Port number of the model service. - - parallel (int): Number of questions to process in parallel. 
- - result_file (str): File path to store the results. - - Returns: - None - - """ - # Read data - filename = "test.jsonl" - - lines = list(read_jsonl(filename)) - - # Construct prompts - num_questions = args.num_questions - num_shots = args.num_shots - few_shot_examples = get_few_shot_examples(lines, num_shots) - - questions = [] - labels = [] - for i in range(len(lines[:num_questions])): - questions.append(get_one_example(lines, i, False)) - labels.append(get_answer_value(lines[i]["answer"])) - assert all(l != INVALID for l in labels) - - states = [None] * len(labels) - - # Use thread pool - def get_one_answer(i): - answer = call_generate( - prompt=few_shot_examples + questions[i], - # stop=["Question", "Assistant:", "<|separator|>"], - ip=args.ip, - port=args.port, - ) - states[i] = answer - - tic = time.time() - if args.parallel == 1: - for i in tqdm(range(len(questions))): - get_one_answer(i) - else: - with ThreadPoolExecutor(args.parallel) as executor: - list( - tqdm( - executor.map(get_one_answer, list(range(len(questions)))), - total=len(questions), - ) - ) - - latency = time.time() - tic - preds = [] - for i in range(len(states)): - preds.append(get_answer_value(states[i])) - - # Compute accuracy - acc = np.mean(np.array(preds) == np.array(labels)) - invalid = np.mean(np.array(preds) == INVALID) - - # Print results - print(f"Accuracy: {acc:.3f}") - print(f"Invalid: {invalid:.3f}") - print(f"Latency: {latency:.3f} s") - - with open(args.result_file, "a") as fout: - value = { - "task": "gsm8k", - "backend": "paddlepaddle", - "num_gpus": 1, - "latency": round(latency, 3), - "accuracy": round(acc, 3), - "num_requests": args.num_questions, - "other": { - "num_questions": args.num_questions, - "parallel": args.parallel, - }, - } - fout.write(json.dumps(value) + "\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--ip", type=str, default="127.0.0.1") - parser.add_argument("--port", type=str, default="8188") - 
parser.add_argument("--num-shots", type=int, default=10) - parser.add_argument("--data-path", type=str, default="test.jsonl") - parser.add_argument("--num-questions", type=int, default=1319) - parser.add_argument("--result-file", type=str, default="result.jsonl") - parser.add_argument("--parallel", type=int, default=1) - args = parser.parse_args() - main(args) -``` - -3. Prepare `run_bench.sh` +Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-VL-28B-A3B-Paddle.md), the command as bellow: +server: ```bash #!/bin/bash export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection - -python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8 +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /home/paddle/ERNIE-4.5-21B-A3B-Paddle \ + --port 8180 \ + --tensor-parallel-size 1 \ + --quantization wint8 \ + --max-model-len 32768 \ + --max-num-seqs 8 \ + --block-size 16 ``` +If you want to use v0 loader, please set `--load-choices "default"`. -4. Running the Script +client: -Firstly, open a terminal and run: +- Simple request: ```bash -./run_bench.sh +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "Write me a poem about large language model."} + ] +}' ``` -After the service is ready, open another terminal and run: + +- Test GSM8K dataset benchmark +1) Download GSM8K dataset ```bash -python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8 +wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl ``` -It takes about 4.8 hours to run the GSM8K dataset. 
+2) Copy `bench_gsm8k.py` to your workspace
+```bash
+cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
+```
+3) Execute
+```bash
+python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
+```
+It takes about 52 minutes to run the GSM8K dataset.
+
+```
+Accuracy: 0.914
+Invalid: 0.000
+Latency: 3143.301 s
+```
+
+#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
+
+Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md), the command is as below:
+
+server:
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+python3 -m fastdeploy.entrypoints.openai.api_server \
+ --model /home/paddle/ERNIE-4.5-21B-A3B-Thinking \
+ --port 8180 \
+ --tensor-parallel-size 1 \
+ --max-model-len 32768 \
+ --quantization wint8 \
+ --reasoning-parser ernie_x1 \
+ --tool-call-parser ernie_x1 \
+ --max-num-seqs 8 \
+ --block-size 16
+```
+
+client:
+
+```bash
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+ "messages": [
+ {"role": "user", "content": "Write me a poem about large language model."}
+ ]
+}'
+```
+
+#### 4.1.3 ERNIE-4.5-300B-A47B
+First, `TP=16` is required when running `ERNIE-4.5-300B-A47B`, and the full model needs to be loaded into host memory, which requires more than 600GB of host memory. This issue will be optimized in subsequent versions.
+
+Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md), the command is as below:
+
+server:
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+python3 -m fastdeploy.entrypoints.openai.api_server \
+ --model /home/paddle/ERNIE-4.5-300B-A47B \
+ --port 8180 \
+ --tensor-parallel-size 16 \
+ --quantization wint8 \
+ --max-model-len 32768 \
+ --max-num-seqs 8 \
+ --block-size 16
+```
+If you want to use v0 loader, please set `--load-choices "default"`.
+
+client:
+
+- Simple request:
+```bash
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+ "messages": [
+ {"role": "user", "content": "Write me a poem about large language model."}
+ ]
+}'
+```
+
+- Test GSM8K dataset benchmark
+1) Download GSM8K dataset
+```bash
+wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+```
+2) Copy `bench_gsm8k.py` to your workspace
+```bash
+cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
+```
+3) Execute
+```bash
+python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
+```
+It takes about 52 minutes to run the GSM8K dataset.
``` Accuracy: 0.962 @@ -412,48 +270,12 @@ Invaild: 0.000 Latency: 17332.728 s ``` -# Run ERNIE-4.5-VL-28B-A3B-Paddle model on iluvatar machine +### 4.2 ERNIE-4.5-VL series +#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle -## Machine Preparation -First, the `TP=2` when running the ERNIE-4.5-VL-28B-A3B-Paddle model and so you need to prepare a machine with the following configurations: +**offline demo** -| CPU | Memory | Card | Hard Disk| -| :---: | :---: | :---: | :---: | -| x86 | 1TB| 2xBI150| 1TB| - -## Image Preparation -Pull the Docker image - -```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -``` - -## Container Preparation -### Start Container - -```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -docker exec -it paddle_infer bash -``` - -/home/paddle contains the model files, *.whl packages, and scripts. - -### Install paddle - -```bash -pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ -``` -For latest paddle version on iluvatar. 
Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) - -### Install FastDeploy -```bash -pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.aliyun.com/pypi/simple/ -``` - -## Prepare the inference demo script - -script list below: +The script as bellow: `run_demo_vl.sh`: @@ -463,7 +285,6 @@ export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 run_demo_vl.py ``` @@ -528,12 +349,6 @@ for output in outputs: print(f"generated_text={generated_text}") ``` -## run demo - -```bash -./run_demo_vl.sh -``` - The following logs will be printed: ``` @@ -557,10 +372,9 @@ generated_text= 这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。 ``` -## Testing thinking model +**online demo** -### ERNIE-4.5-21B-A3B-Thinking -Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md), the command is bellow: +Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), the command as bellow: server: ```bash @@ -569,52 +383,16 @@ export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 -m fastdeploy.entrypoints.openai.api_server \ - --model baidu/ERNIE-4.5-21B-A3B-Thinking \ + --model /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle \ --port 8180 \ --tensor-parallel-size 2 \ --max-model-len 32768 \ --quantization wint8 \ - --block-size 16 \ - --reasoning-parser ernie_x1 \ - --tool-call-parser ernie_x1 \ - --max-num-seqs 8 -``` - -client: - -```bash -curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ --H "Content-Type: application/json" \ --d '{ - "messages": [ - {"role": "user", "content": 
"Write me a poem about large language model."} - ] -}' -``` - -### ERNIE-4.5-VL-28B-A3B -Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), set `"chat_template_kwargs":{"enable_thinking": true}` and the command is bellow: - -server: -```bash -#!/bin/bash -export PADDLE_XCCL_BACKEND=iluvatar_gpu -export INFERENCE_MSG_QUEUE_ID=232132 -export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 -export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 -python3 -m fastdeploy.entrypoints.openai.api_server \ - --model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \ - --port 8180 \ - --tensor-parallel-size 2 \ - --max-model-len 32768 \ - --quantization wint8 \ - --block-size 16 \ --limit-mm-per-prompt '{"image": 100, "video": 100}' \ --reasoning-parser ernie-45-vl \ - --max-num-seqs 8 + --max-num-seqs 8 \ + --block-size 16 ``` client: @@ -629,12 +407,13 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "From which era does the artifact in the image originate?"} ]} ], - "chat_template_kwargs":{"enable_thinking": true} + "chat_template_kwargs":{"enable_thinking": false} }' ``` -### ERNIE-4.5-VL-28B-A3B-Thinking -Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl-thinking.md), the command is bellow: +#### 4.2.2 ERNIE-4.5-VL-28B-A3B-Thinking + +Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl-thinking.md), the command as bellow: server: ```bash @@ -643,19 +422,18 @@ export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 -m fastdeploy.entrypoints.openai.api_server \ - --model baidu/ERNIE-4.5-VL-28B-A3B-Thinking \ + --model /home/paddle/ERNIE-4.5-VL-28B-A3B-Thinking \ --port 8180 \ --tensor-parallel-size 2 \ --max-model-len 32768 \ --quantization wint8 \ - 
--block-size 16 \ --limit-mm-per-prompt '{"image": 100, "video": 100}' \ --reasoning-parser ernie-45-vl-thinking \ --tool-call-parser ernie-45-vl-thinking \ --mm-processor-kwargs '{"image_max_pixels": 12845056 }' \ - --max-num-seqs 8 + --max-num-seqs 8 \ + --block-size 16 ``` client: @@ -671,3 +449,87 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ ] }' ``` + +### 4.3 PaddleOCR-VL series +#### 4.3.1 PaddleOCR-VL-0.9B + +- (Optional) Install paddleocr + +To install the latest `paddleocr`, you can compile it from source. The image contains a compilation and installation based on source code `39128c2c7fd40be44d8f33498cabd4ec10f1bfcd`. + +```bash +git clone -b main https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip3 install -e ".[doc-parser]" +``` + +Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/PaddleOCR-VL-0.9B.md), the command as bellow: + +server: +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export INFERENCE_MSG_QUEUE_ID=232132 +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /data1/fastdeploy/PaddleOCR-VL \ + --port 8180 \ + --metrics-port 8471 \ + --engine-worker-queue-port 8472 \ + --cache-queue-port 55660 \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --max-num-seqs 32 \ + --workers 2 \ + --block-size 16 +``` + +client: + +**simple demo** + +```bash +paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png --vl_rec_backend fastdeploy-server --vl_rec_server_url http://127.0.0.1:8180/v1 +``` + +The output is: + +{'res': {'input_path': '/root/.paddlex/predict_input/paddleocr_vl_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_layout_detection': True, 'use_chart_recognition': False, 'format_block_content': False}, 'layout_det_res': {'input_path': None, 
'page_index': None, 'boxes': [{'cls_id': 6, 'label': 'doc_title', 'score': 0.9636866450309753, 'coordinate': [131.31543, 36.45137, 1384.522, 127.98457]}, {'cls_id': 22, 'label': 'text', 'score': 0.928146243095398, 'coordinate': [585.39355, 158.43787, 930.2197, 182.57446]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840242266654968, 'coordinate': [9.02211, 200.86037, 361.41748, 343.8839]}, {'cls_id': 14, 'label': 'image', 'score': 0.9871442914009094, 'coordinate': [775.5067, 200.66461, 1503.379, 684.9366]}, {'cls_id': 22, 'label': 'text', 'score': 0.9801799058914185, 'coordinate': [9.532669, 344.90558, 361.44202, 440.8252]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9708914756774902, 'coordinate': [28.03984, 455.88013, 341.72076, 520.7113]}, {'cls_id': 22, 'label': 'text', 'score': 0.9825296401977539, 'coordinate': [8.897079, 536.5491, 361.0522, 655.80566]}, {'cls_id': 22, 'label': 'text', 'score': 0.982223391532898, 'coordinate': [8.970978, 657.4961, 362.01614, 774.6245]}, {'cls_id': 24, 'label': 'vision_footnote', 'score': 0.9001952409744263, 'coordinate': [809.06995, 703.70044, 1488.3029, 750.5239]}, {'cls_id': 22, 'label': 'text', 'score': 0.9767361879348755, 'coordinate': [9.407532, 776.5222, 361.31128, 846.8281]}, {'cls_id': 22, 'label': 'text', 'score': 0.9868096113204956, 'coordinate': [8.669312, 848.2549, 361.64832, 1062.8562]}, {'cls_id': 22, 'label': 'text', 'score': 0.9826636910438538, 'coordinate': [8.8025055, 1063.8627, 361.46454, 1182.8519]}, {'cls_id': 22, 'label': 'text', 'score': 0.9825499653816223, 'coordinate': [8.82019, 1184.4667, 361.66507, 1302.4513]}, {'cls_id': 22, 'label': 'text', 'score': 0.9584522843360901, 'coordinate': [9.170425, 1304.2166, 361.48846, 1351.7488]}, {'cls_id': 22, 'label': 'text', 'score': 0.978195309638977, 'coordinate': [389.1593, 200.38223, 742.76196, 295.65167]}, {'cls_id': 22, 'label': 'text', 'score': 0.9844739437103271, 'coordinate': [388.73267, 297.18472, 744.0012, 441.30356]}, {'cls_id': 17, 'label': 
'paragraph_title', 'score': 0.9680613875389099, 'coordinate': [409.39398, 455.8943, 721.71893, 520.9389]}, {'cls_id': 22, 'label': 'text', 'score': 0.9741637706756592, 'coordinate': [389.7167, 536.8141, 742.71155, 608.0021]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840295910835266, 'coordinate': [389.30914, 609.3971, 743.0931, 750.32263]}, {'cls_id': 22, 'label': 'text', 'score': 0.9845904111862183, 'coordinate': [389.1331, 751.77673, 743.05884, 894.88196]}, {'cls_id': 22, 'label': 'text', 'score': 0.9848388433456421, 'coordinate': [388.83295, 896.0353, 743.5821, 1038.7367]}, {'cls_id': 22, 'label': 'text', 'score': 0.9804726243019104, 'coordinate': [389.0833, 1039.9131, 742.7598, 1134.4902]}, {'cls_id': 22, 'label': 'text', 'score': 0.9864556789398193, 'coordinate': [388.5259, 1135.8118, 743.45215, 1352.0105]}, {'cls_id': 22, 'label': 'text', 'score': 0.9869311451911926, 'coordinate': [769.8312, 775.6598, 1124.9835, 1063.2106]}, {'cls_id': 22, 'label': 'text', 'score': 0.9822818040847778, 'coordinate': [770.3026, 1063.9371, 1124.8307, 1184.2206]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.968923032283783, 'coordinate': [791.3031, 1199.3169, 1104.454, 1264.6992]}, {'cls_id': 22, 'label': 'text', 'score': 0.9712913036346436, 'coordinate': [770.42285, 1279.6072, 1124.6924, 1351.8679]}, {'cls_id': 22, 'label': 'text', 'score': 0.9236321449279785, 'coordinate': [1153.9055, 775.5812, 1334.0662, 798.1588]}, {'cls_id': 22, 'label': 'text', 'score': 0.985789954662323, 'coordinate': [1151.5193, 799.27954, 1506.362, 991.1172]}, {'cls_id': 22, 'label': 'text', 'score': 0.9820653796195984, 'coordinate': [1151.5708, 991.9118, 1506.6016, 1110.8875]}, {'cls_id': 22, 'label': 'text', 'score': 0.9865990877151489, 'coordinate': [1151.6917, 1112.1348, 1507.1611, 1351.9453]}]}, 'parsing_res_list': [{'block_label': 'doc_title', 'block_content': '助力双方交往 搭建友谊桥梁', 'block_bbox': [131, 36, 1384, 127]}, {'block_label': 'text', 'block_content': '本报记者 沈小晓 任彦 黄培昭', 'block_bbox': 
[585, 158, 930, 182]}, {'block_label': 'text', 'block_content': '身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等,曼妙的舞姿赢得现场观众阵阵掌声。这是日前厄立特里亚高等教育与研究院孔子学院(以下简称“厄特孔院”)举办“喜迎新年”中国歌舞比赛的场景。', 'block_bbox': [9, 200, 361, 343]}, {'block_label': 'image', 'block_content': '', 'block_bbox': [775, 200, 1503, 684]}, {'block_label': 'text', 'block_content': '中国和厄立特里亚传统友谊深厚。近年来,在高质量共建“一带一路”框架下,中厄两国人文交流不断深化,互利合作的民意基础日益深厚。', 'block_bbox': [9, 344, 361, 440]}, {'block_label': 'paragraph_title', 'block_content': '“学好中文,我们的未来不是梦”', 'block_bbox': [28, 455, 341, 520]}, {'block_label': 'text', 'block_content': '鲜花曾告诉我你怎样走过,大地知道你心中的每一个角落……”厄立特里亚阿斯马拉大学综合楼二层,一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门,学生们正跟着老师学唱中文歌曲《同一首歌》。', 'block_bbox': [8, 536, 361, 655]}, {'block_label': 'text', 'block_content': '这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起,学生们边唱边随着节拍摇动身体,现场气氛热烈。', 'block_bbox': [8, 657, 362, 774]}, {'block_label': 'vision_footnote', 'block_content': '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。\n中国驻厄立特里亚大使馆供图', 'block_bbox': [809, 703, 1488, 750]}, {'block_label': 'text', 'block_content': '“这是中文歌曲初级班,共有32人。学生大部分来自首都阿斯马拉的中小学,年龄最小的仅有6岁。”尤斯拉告诉记者。', 'block_bbox': [9, 776, 361, 846]}, {'block_label': 'text', 'block_content': '尤斯拉今年23岁,是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文,在2017年第十届“汉语桥”世界中学生中文比赛中获得厄立特里亚赛区第一名,并和同伴代表厄立特里亚前往中国参加决赛, +获得团体优胜奖。2022年起,尤斯拉开始在厄特孔院兼职教授中文歌曲,每周末两个课时。“中国文化博大精深,我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。', 'block_bbox': [8, 848, 361, 1062]}, {'block_label': 'text', 'block_content': '“姐姐,你想去中国吗?”“非常想!我想去看故宫、爬长城。”尤斯拉的学生中有一对能歌善舞的姐妹,姐姐露娅今年15岁,妹妹莉娅14岁,两人都已在厄特孔院学习多年,中文说得格外流利。', 'block_bbox': [8, 1063, 361, 1182]}, {'block_label': 'text', 'block_content': '露娅对记者说:“这些年来,怀着对中文和中国文化的热爱,我们姐妹俩始终相互鼓励,一起学习。我们的中文一天比一天好,还学会了中文歌和中国舞。我们一定要到中国去。学好中文,我们的未来不是梦!”', 'block_bbox': [8, 1184, 361, 1302]}, {'block_label': 'text', 'block_content': '据厄特孔院中方院长黄鸣飞介绍,这所孔院成立于2013年3月,由贵州财经大学和厄立特里亚高等教育与研究院合作建立,开设了中国语言课程和中国文化课程,注册学生2万余人次。10余年来,厄特孔院已成为当地民众了解中国的一扇窗口。', 'block_bbox': [9, 1304, 361, 1351]}, {'block_label': 
'text', 'block_content': '', 'block_bbox': [389, 200, 742, 295]}, {'block_label': 'text', 'block_content': '黄鸣飞表示,随着来学习中文的人日益增多,阿斯马拉大学教学点已难以满足教学需要。2024年4月,由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设,预计今年上半年竣工,建成后将为厄特孔院提供全新的办学场地。', 'block_bbox': [388, 297, 744, 441]}, {'block_label': 'paragraph_title', 'block_content': '“在中国学习的经历让我看到更广阔的世界”', 'block_bbox': [409, 455, 721, 520]}, {'block_label': 'text', 'block_content': '多年来,厄立特里亚广大赴华留学生和培训人员积极投身国家建设,成为助力该国发展的人才和厄中友好的见证者和推动者。', 'block_bbox': [389, 536, 742, 608]}, {'block_label': 'text', 'block_content': '在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位,研究方向是女性领导力与社会发展。其间,她实地走访中国多个地区,获得了观察中国社会发展的第一手资料。', 'block_bbox': [389, 609, 743, 750]}, {'block_label': 'text', 'block_content': '谈起在中国求学的经历,约翰娜记忆犹新:“中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行,中国创造了发展奇迹,这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”', 'block_bbox': [389, 751, 743, 894]}, {'block_label': 'text', 'block_content': '正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前,在北京师范大学获得硕士学位后,穆卢盖塔在社交媒体上写下这样一段话:“这是我人生的重要一步,自此我拥有了一双坚固的鞋子,赋予我穿越荆棘的力量。”', 'block_bbox': [388, 896, 743, 1038]}, {'block_label': 'text', 'block_content': '穆卢盖塔密切关注中国在经济、科技、教育等领域的发展,“中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界,从中受益匪浅。”', 'block_bbox': [389, 1039, 742, 1134]}, {'block_label': 'text', 'block_content': '23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年,在中国书法、中国画等方面表现十分优秀,在2024年厄立特里亚赛区的“汉语桥”比赛中获得一等奖。莉迪亚说:“学习中国书法让我的内心变得安宁和纯粹。我也喜欢中国的服饰,希望未来能去中国学习,把中国不同民族元素融入服装设计中,创作出更多精美作品,也把厄特文化分享给更多的中国朋友。”\n“不管远近都是客人,请不用客气;相约好了在一起,我们欢迎你……”在一场中厄青年联谊活动上,四川路桥中方员工同当地大学生合唱《北京 +欢迎你》。厄立特里亚技术学院计算机科学与工程专业学生鲁夫塔·谢拉是其中一名演唱者,她很早便在孔院学习中文,一直在为去中国留学作准备。“这句歌词是我们两国人民友谊的生动写照。无论是投身于厄特里亚基础设施建设的中企员工,还是在中国留学的厄立特里亚学子,两国人民携手努力,必将推动两国关系不断向前发展。”鲁夫塔说。', 'block_bbox': [388, 1135, 743, 1352]}, {'block_label': 'text', 'block_content': '', 'block_bbox': [769, 775, 1124, 1063]}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育委员会主任助理萨马瑞表示:“每年我们都会组织学生到中国访问学习,目前有超过5000名厄立特里亚学生在中国留学。学习中国的教育经验,有助于提升厄立特里亚的教育水平。”', 'block_bbox': [770, 1063, 1124, 1184]}, {'block_label': 'paragraph_title', 'block_content': 
'“共同向世界展示非洲和亚洲的灿烂文明”', 'block_bbox': [791, 1199, 1104, 1264]}, {'block_label': 'text', 'block_content': '从阿斯马拉出发,沿着蜿蜒曲折的盘山公路一路向东寻找丝路印迹。驱车两个小时,记者来到位于厄立特里亚港口城市马萨瓦的北红海省博物馆。', 'block_bbox': [770, 1279, 1124, 1351]}, {'block_label': 'text', 'block_content': '', 'block_bbox': [1153, 775, 1334, 798]}, {'block_label': 'text', 'block_content': '博物馆二层陈列着一个发掘自阿杜利斯古城的中国古代陶制酒器,罐身上写着“万”“和”“禅”“山”等汉字。“这件文物证明,很早以前我们就通过海上丝绸之路进行贸易往来与文化交流。这也是厄立特里亚与中国友好交往历史的有力证明。”北红海省博物馆研究与文献部负责人伊萨亚斯·特斯法兹吉说。', 'block_bbox': [1151, 799, 1506, 991]}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆考古学和人类学研究员菲尔蒙·特韦尔德十分喜爱中国文化。他表示:“学习彼此的语言和文化,将帮助厄中两国人民更好地理解彼此,助力双方交往,搭建友谊桥梁。”', 'block_bbox': [1151, 991, 1506, 1110]}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆馆长塔吉丁·努里达姆·优素福曾多次 +访问中国,对中华文明的传承与创新、现代化博物馆的建设与发展印象深刻。“中国博物馆不仅有许多保存完好的文物,还充分运用先进科技手段进行展示,帮助人们更好理解中华文明。”塔吉丁说,“厄立特里亚与中国都拥有悠久的文明,始终相互理解、相互尊重。我希望未来与中国同行加强合作,共同向世界展示非洲和亚洲的灿烂文明。”', 'block_bbox': [1151, 1112, 1507, 1351]}]}} + +**benchmark** + +1. Download and extract image datasets + +```bash +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar +tar xvf images.tar +``` + +2. Prepare `infer_ocr_vl_benchmark.py` + +```python +import os +from paddleocr import PaddleOCRVL + +input_path = "./images" +pipeline = PaddleOCRVL(vl_rec_backend="fastdeploy-server", vl_rec_server_url="http://127.0.0.1:8180/v1") +file_list = os.listdir(input_path) +for file_name in file_list: + file_path = os.path.join(input_path, file_name) + output = pipeline.predict(file_path) + for res in output: + res.print() + res.save_to_markdown(save_path="output", pretty=False) +``` + +3. execute `infer_ocr_vl_benchmark.py` on client + +```bash +python3 infer_ocr_vl_benchmark.py +``` + +After each image is inferred, a corresponding `md` file will be generated in the `output` path. Running the entire benchmark (1355 images) takes approximately 5 hours. 
diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index 06ad189ed..332c65a85 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -1,70 +1,66 @@ [English](../../../get_started/installation/iluvatar_gpu.md) -# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B +## 1. 准备机器 -## 准备机器 -首先运行ERNIE4.5 300B模型需要`TP=16`, 所以您需要准备以下配置的机器: - -| CPU | 内存 | 天数 | 硬盘| -|-----|------|-----|-----| +| CPU | Memory | Card | Hard Disk| +| :---: | :---: | :---: | :---: | | x86 | 1TB| 16xBI150| 1TB| -目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。 - -## 镜像 -从官网获取: +## 2. 准备镜像 +Pull the Docker image ```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:paddle-ocr-vl-1107 ``` -## 准备容器 -### 启动容器 +## 3. 准备容器 +### 3.1 启动容器 ```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:paddle-ocr-vl-1107 docker exec -it paddle_infer bash ``` -/home/paddle 为模型文件、whl包、脚本所在目录 +/home/paddle 为模型文件、whl包、脚本所在目录。 -### 安装paddle +### 3.2 安装paddle ```bash -pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ -pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ +pip3 install paddlepaddle==3.3.0.dev20251103 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ +pip3 install paddle-iluvatar-gpu==3.0.0.dev20251107 -i 
https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ ``` -获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) -### 安装fastdeploy +### 3.3 安装fastdeploy ```bash -pip3 install fastdeploy_iluvatar_gpu==2.1.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels +pip3 install fastdeploy_iluvatar_gpu==2.4.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.aliyun.com/pypi/simple/ ``` -可以按如下步骤编译FastDeploy,,得到```最新版本```. +可以按如下步骤编译FastDeploy,,得到```最新版本```。 ```bash git clone https://github.com/PaddlePaddle/FastDeploy cd FastDeploy -pip install -r requirements_iluvatar.txt +ln -sf /usr/local/bin/python3 /usr/local/bin/python +pip3 install -r requirements_iluvatar.txt export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 bash build.sh ``` -## 准备推理demo脚本 -推理 demo 路径:/home/paddle/scripts -脚本内容如下 +## 4. 在天数机器上测试模型 +### 4.1 ERNIE-4.5系列 +#### 4.1.1 ERNIE-4.5-21B-A3B-Paddle -`run_demo.sh`: +**离线脚本** +脚本如下所示: ```bash #!/bin/bash export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 -export FD_DEBUG=1 +export FD_SAMPLING_CLASS=rejection python3 run_demo.py ``` -run_demo.py +`run_demo.py`: ```python from fastdeploy import LLM, SamplingParams @@ -74,30 +70,22 @@ prompts = [ "The largest ocean is", ] -# 采样参数 +# sampling parameters sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) -# 加载模型 -llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, quantization='wint8') +# load the model +llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8') -# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理) +# Perform batch inference outputs = llm.generate(prompts, sampling_params) -# 
注意将其中`/home/paddle/ernie-4_5-21b-a3b-bf16-paddle`替换为您下载的ERNIE模型的路径。 -# 输出结果 + for output in outputs: prompt = output.prompt generated_text = output.outputs.text print(prompt, generated_text) ``` -## 运行demo -执行 - -```bash -./run_demo.sh -``` - -会有如下 log 打印;load 模型耗时约74s,demo 运行约240s。 +会有如下 log 打印: ``` /usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md @@ -134,275 +122,145 @@ Now, let's break down each step: The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean ``` -## 在GSM8K数据集上运行ernie4.5 300B模型 +**服务部署** -1. 下载GSM8K数据集 - -```bash -wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl -``` - -2. 准备`bench_gsm8k.py` - -```python -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" Fastdeploy + ERNIE-4.5-Turbo 的指标评估 """ -# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py -import argparse -import ast -import json -import re -import time -from concurrent.futures import ThreadPoolExecutor - -import numpy as np -import requests -from tqdm import tqdm - -INVALID = -9999999 - - -def call_generate(prompt, **kwargs): - """ - Generates response based on the input prompt. - - Args: - prompt (str): The input prompt text. - **kwargs: Keyword arguments, including server IP address and port number. - - Returns: - str: The response generated based on the prompt. - - """ - url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions" - headers = {"Content-Type": "application/json"} - data = { - "messages": [ - { - "role": "user", - "content": prompt, - } - ], - "temperature": 0.6, - "max_tokens": 2047, - "top_p": 0.95, - "do_sample": True, - } - - response = requests.post(url, headers=headers, data=json.dumps(data)) - out = response.json() - return out["choices"][0]["message"]["content"] - - -def get_one_example(lines, i, include_answer): - """ - Retrieves a question-answer example from the given list of text lines. - - Args: - lines (list of dict): A list of question-answer pairs. - i (int): The index of the question-answer pair to retrieve from lines. - include_answer (bool): Whether to include the answer in the returned string. 
- - Returns: - str: A formatted question-answer string in the format "Question: \nAnswer: ". - - """ - ret = "Question: " + lines[i]["question"] + "\nAnswer:" - if include_answer: - ret += " " + lines[i]["answer"] - return ret - - -def get_few_shot_examples(lines, k): - """ - Selects k examples from the given list of text lines and concatenates them into a single string. - - Args: - lines (list): A list containing text lines. - k (int): The number of examples to select. - - Returns: - str: A string composed of k examples, separated by two newline characters. - """ - ret = "" - for i in range(k): - ret += get_one_example(lines, i, True) + "\n\n" - return ret - - -def get_answer_value(answer_str): - """ - Extracts numerical values from an answer string and returns them. - - Args: - answer_str (str): The string containing the answer. - - Returns: - The extracted numerical value; returns "INVALID" if extraction fails. - """ - answer_str = answer_str.replace(",", "") - numbers = re.findall(r"\d+", answer_str) - if len(numbers) < 1: - return INVALID - try: - return ast.literal_eval(numbers[-1]) - except SyntaxError: - return INVALID - - -def read_jsonl(filename: str): - """ - Reads a JSONL file. - - Args: - filename (str): Path to the JSONL file. - - Yields: - dict: A dictionary object corresponding to each line in the JSONL file. - """ - with open(filename) as fin: - for line in fin: - if line.startswith("#"): - continue - yield json.loads(line) - - -def main(args): - """ - Process inputs and generate answers by calling the model in parallel using a thread pool. - - Args: - args (argparse.Namespace): - - num_questions (int): Number of questions to process. - - num_shots (int): Number of few-shot learning examples. - - ip (str): IP address of the model service. - - port (int): Port number of the model service. - - parallel (int): Number of questions to process in parallel. - - result_file (str): File path to store the results. 
- - Returns: - None - - """ - # Read data - filename = "test.jsonl" - - lines = list(read_jsonl(filename)) - - # Construct prompts - num_questions = args.num_questions - num_shots = args.num_shots - few_shot_examples = get_few_shot_examples(lines, num_shots) - - questions = [] - labels = [] - for i in range(len(lines[:num_questions])): - questions.append(get_one_example(lines, i, False)) - labels.append(get_answer_value(lines[i]["answer"])) - assert all(l != INVALID for l in labels) - - states = [None] * len(labels) - - # Use thread pool - def get_one_answer(i): - answer = call_generate( - prompt=few_shot_examples + questions[i], - # stop=["Question", "Assistant:", "<|separator|>"], - ip=args.ip, - port=args.port, - ) - states[i] = answer - - tic = time.time() - if args.parallel == 1: - for i in tqdm(range(len(questions))): - get_one_answer(i) - else: - with ThreadPoolExecutor(args.parallel) as executor: - list( - tqdm( - executor.map(get_one_answer, list(range(len(questions)))), - total=len(questions), - ) - ) - - latency = time.time() - tic - preds = [] - for i in range(len(states)): - preds.append(get_answer_value(states[i])) - - # Compute accuracy - acc = np.mean(np.array(preds) == np.array(labels)) - invalid = np.mean(np.array(preds) == INVALID) - - # Print results - print(f"Accuracy: {acc:.3f}") - print(f"Invalid: {invalid:.3f}") - print(f"Latency: {latency:.3f} s") - - with open(args.result_file, "a") as fout: - value = { - "task": "gsm8k", - "backend": "paddlepaddle", - "num_gpus": 1, - "latency": round(latency, 3), - "accuracy": round(acc, 3), - "num_requests": args.num_questions, - "other": { - "num_questions": args.num_questions, - "parallel": args.parallel, - }, - } - fout.write(json.dumps(value) + "\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--ip", type=str, default="127.0.0.1") - parser.add_argument("--port", type=str, default="8188") - parser.add_argument("--num-shots", type=int, default=10) - 
parser.add_argument("--data-path", type=str, default="test.jsonl") - parser.add_argument("--num-questions", type=int, default=1319) - parser.add_argument("--result-file", type=str, default="result.jsonl") - parser.add_argument("--parallel", type=int, default=1) - args = parser.parse_args() - main(args) -``` - -3. 准备`run_bench.sh` +参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-VL-28B-A3B-Paddle.md), 命令如下所示: +服务端: ```bash #!/bin/bash export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /home/paddle/ERNIE-4.5-21B-A3B-Paddle \ + --port 8180 \ + --tensor-parallel-size 1 \ + --quantization wint8 \ + --max-model-len 32768 \ + --max-num-seqs 8 \ + --block-size 16 +``` +如果想切换到 v0 loader, 请设置 `--load-choices "default"`。 -python3 -m fastdeploy.entrypoints.openai.api_server --model "/home/paddle/ernie-45t" --port 8188 --tensor-parallel-size 16 --block-size 16 --quantization wint8 +客户端: + +- 简单请求: +```bash +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "Write me a poem about large language model."} + ] +}' ``` -4. 运行脚本 - -首先打开一个终端执行服务端命令: +- 测试GSM8K数据集性能 +1) 下载GSM8K数据集 ```bash -./run_bench.sh +wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl ``` -等服务起好后,在打开另一个终端执行客户端命令: +2) 将`bench_gsm8k.py`拷贝到工作目录 ```bash -python3 -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 8 +cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py . 
+``` +3) 执行 +```bash +python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 +``` +推理整个GSM8K数据集大概需要52分钟。 + +``` +Accuracy: 0.914 +Invaild: 0.000 +Latency: 3143.301 s +``` + +#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking + +参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md), 命令如下所示: + +服务端: +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export INFERENCE_MSG_QUEUE_ID=232132 +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /home/paddle/ERNIE-4.5-21B-A3B-Thinking \ + --port 8180 \ + --tensor-parallel-size 1 \ + --max-model-len 32768 \ + --quantization wint8 \ + --reasoning-parser ernie_x1 \ + --tool-call-parser ernie_x1 \ + --max-num-seqs 8 \ + --block-size 16 +``` + +客户端: + +```bash +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "Write me a poem about large language model."} + ] +}' +``` + +#### 4.1.3 ERNIE-4.5-300B-A47B +首先,运行`ERNIE-4.5-300B-A47B`需要`TP=16`。目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。 + +参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-300B-A47B-Paddle.md), 命令如下所示: + +服务端: +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export INFERENCE_MSG_QUEUE_ID=232132 +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /home/paddle/ERNIE-4.5-300B-A47B \ + --port 8180 \ + --tensor-parallel-size 16 \ + --quantization wint8 \ + --max-model-len 32768 \ + --max-num-seqs 8 \ + --block-size 16 +``` +如果想切换到 v0 loader, 请设置 `--load-choices "default"`。 + +客户端: + +- 简单请求: +```bash +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d 
'{ + "messages": [ + {"role": "user", "content": "Write me a poem about large language model."} + ] +}' +``` + +- 测试GSM8K数据集性能 +1) 下载GSM8K数据集 +```bash +wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl +``` +2) 将`bench_gsm8k.py`拷贝到工作目录 +```bash +cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py . +``` +3) 执行 +```bash +python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 ``` 推理整个GSM8K数据集大概需要4.8个小时。 @@ -412,48 +270,12 @@ Invaild: 0.000 Latency: 17332.728 s ``` -# 如何在天数机器上运行ERNIE-4.5-VL-28B-A3B-Paddle model +### 4.2 ERNIE-4.5-VL系列 +#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle -## 准备机器 -首先运行ERNIE-4.5-VL-28B-A3B-Paddle模型需要`TP=2`, 所以您需要准备以下配置的机器:: +**离线脚本** -| CPU | Memory | Card | Hard Disk| -| :---: | :---: | :---: | :---: | -| x86 | 1TB| 2xBI150| 1TB| - -## 准备镜像 -拉取镜像: - -```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -``` - -## 准备容器 -### 启动容器 - -```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -docker exec -it paddle_infer bash -``` - -/home/paddle 为模型文件、whl包、脚本所在目录。 - -### Install paddle - -```bash -pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ -``` -获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) - -### 安装FastDeploy -```bash -pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.aliyun.com/pypi/simple/ -``` - -## 准备推理demo脚本 - -脚本列表如下所示: +脚本如下所示: `run_demo_vl.sh`: @@ -463,7 +285,6 @@ export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 run_demo_vl.py ``` @@ -528,13 +349,7 @@ for output in outputs: print(f"generated_text={generated_text}") ``` -## 运行demo - -```bash -./run_demo_vl.sh -``` - -打印如下log: +会有如下 log 打印: ``` [2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface @@ -557,67 +372,30 @@ generated_text= 这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。 ``` -## 测试thinking模型 +**服务部署** -### ERNIE-4.5-21B-A3B-Thinking -参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md), 命令如下所示: +参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), 命令如下所示: -server: +服务端: ```bash #!/bin/bash export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 -m fastdeploy.entrypoints.openai.api_server \ - --model baidu/ERNIE-4.5-21B-A3B-Thinking \ + --model /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle \ --port 8180 \ --tensor-parallel-size 2 \ --max-model-len 32768 \ --quantization wint8 \ - --block-size 16 \ - --reasoning-parser ernie_x1 \ - --tool-call-parser ernie_x1 \ - --max-num-seqs 8 -``` - -client: - -```bash -curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ --H "Content-Type: application/json" \ --d '{ - "messages": [ - {"role": "user", "content": "Write me a poem about large language model."} - ] -}' -``` - -### ERNIE-4.5-VL-28B-A3B -参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), 设置 `"chat_template_kwargs":{"enable_thinking": true}`,命令如下所示: - -server: -```bash -#!/bin/bash -export PADDLE_XCCL_BACKEND=iluvatar_gpu -export INFERENCE_MSG_QUEUE_ID=232132 -export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 -export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 -python3 -m 
fastdeploy.entrypoints.openai.api_server \ - --model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \ - --port 8180 \ - --tensor-parallel-size 2 \ - --max-model-len 32768 \ - --quantization wint8 \ - --block-size 16 \ --limit-mm-per-prompt '{"image": 100, "video": 100}' \ --reasoning-parser ernie-45-vl \ - --max-num-seqs 8 + --max-num-seqs 8 \ + --block-size 16 ``` -client: +客户端: ```bash curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ @@ -629,36 +407,36 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ {"type": "text", "text": "From which era does the artifact in the image originate?"} ]} ], - "chat_template_kwargs":{"enable_thinking": true} + "chat_template_kwargs":{"enable_thinking": false} }' ``` -### ERNIE-4.5-VL-28B-A3B-Thinking -参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl-thinking.md), 命令如下所示: +#### 4.2.2 ERNIE-4.5-VL-28B-A3B-Thinking -server: +参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl-thinking.md), 命令如下所示: + +服务端: ```bash #!/bin/bash export PADDLE_XCCL_BACKEND=iluvatar_gpu export INFERENCE_MSG_QUEUE_ID=232132 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection -export FD_DEBUG=1 python3 -m fastdeploy.entrypoints.openai.api_server \ - --model baidu/ERNIE-4.5-VL-28B-A3B-Thinking \ + --model /home/paddle/ERNIE-4.5-VL-28B-A3B-Thinking \ --port 8180 \ --tensor-parallel-size 2 \ --max-model-len 32768 \ --quantization wint8 \ - --block-size 16 \ --limit-mm-per-prompt '{"image": 100, "video": 100}' \ --reasoning-parser ernie-45-vl-thinking \ --tool-call-parser ernie-45-vl-thinking \ --mm-processor-kwargs '{"image_max_pixels": 12845056 }' \ - --max-num-seqs 8 + --max-num-seqs 8 \ + --block-size 16 ``` -client: +客户端: ```bash curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ -H "Content-Type: application/json" \ @@ -671,3 +449,84 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ ] }' ``` + +### 4.3 
PaddleOCR-VL系列 +#### 4.3.1 PaddleOCR-VL-0.9B + +- (可选) 安装 paddleocr + +如果想要安装最新的`paddleocr`,可以源码编译。镜像里是基于`39128c2c7fd40be44d8f33498cabd4ec10f1bfcd`源码编译安装的 + +```bash +git clone -b main https://github.com/PaddlePaddle/PaddleOCR.git +cd PaddleOCR +pip3 install -e ".[doc-parser]" +``` + +参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/PaddleOCR-VL-0.9B.md), 命令如下所示: + +服务端: +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export INFERENCE_MSG_QUEUE_ID=232132 +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /data1/fastdeploy/PaddleOCR-VL \ + --port 8180 \ + --metrics-port 8471 \ + --engine-worker-queue-port 8472 \ + --cache-queue-port 55660 \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --max-num-seqs 32 \ + --workers 2 \ + --block-size 16 +``` + +客户端: +```bash +paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png --vl_rec_backend fastdeploy-server --vl_rec_server_url http://127.0.0.1:8180/v1 +``` + +输出如下所示: + +{'res': {'input_path': '/root/.paddlex/predict_input/paddleocr_vl_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_layout_detection': True, 'use_chart_recognition': False, 'format_block_content': False}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 6, 'label': 'doc_title', 'score': 0.9636866450309753, 'coordinate': [131.31543, 36.45137, 1384.522, 127.98457]}, {'cls_id': 22, 'label': 'text', 'score': 0.928146243095398, 'coordinate': [585.39355, 158.43787, 930.2197, 182.57446]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840242266654968, 'coordinate': [9.02211, 200.86037, 361.41748, 343.8839]}, {'cls_id': 14, 'label': 'image', 'score': 0.9871442914009094, 'coordinate': [775.5067, 200.66461, 1503.379, 684.9366]}, {'cls_id': 22, 'label': 'text', 'score': 
0.9801799058914185, 'coordinate': [9.532669, 344.90558, 361.44202, 440.8252]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9708914756774902, 'coordinate': [28.03984, 455.88013, 341.72076, 520.7113]}, {'cls_id': 22, 'label': 'text', 'score': 0.9825296401977539, 'coordinate': [8.897079, 536.5491, 361.0522, 655.80566]}, {'cls_id': 22, 'label': 'text', 'score': 0.982223391532898, 'coordinate': [8.970978, 657.4961, 362.01614, 774.6245]}, {'cls_id': 24, 'label': 'vision_footnote', 'score': 0.9001952409744263, 'coordinate': [809.06995, 703.70044, 1488.3029, 750.5239]}, {'cls_id': 22, 'label': 'text', 'score': 0.9767361879348755, 'coordinate': [9.407532, 776.5222, 361.31128, 846.8281]}, {'cls_id': 22, 'label': 'text', 'score': 0.9868096113204956, 'coordinate': [8.669312, 848.2549, 361.64832, 1062.8562]}, {'cls_id': 22, 'label': 'text', 'score': 0.9826636910438538, 'coordinate': [8.8025055, 1063.8627, 361.46454, 1182.8519]}, {'cls_id': 22, 'label': 'text', 'score': 0.9825499653816223, 'coordinate': [8.82019, 1184.4667, 361.66507, 1302.4513]}, {'cls_id': 22, 'label': 'text', 'score': 0.9584522843360901, 'coordinate': [9.170425, 1304.2166, 361.48846, 1351.7488]}, {'cls_id': 22, 'label': 'text', 'score': 0.978195309638977, 'coordinate': [389.1593, 200.38223, 742.76196, 295.65167]}, {'cls_id': 22, 'label': 'text', 'score': 0.9844739437103271, 'coordinate': [388.73267, 297.18472, 744.0012, 441.30356]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.9680613875389099, 'coordinate': [409.39398, 455.8943, 721.71893, 520.9389]}, {'cls_id': 22, 'label': 'text', 'score': 0.9741637706756592, 'coordinate': [389.7167, 536.8141, 742.71155, 608.0021]}, {'cls_id': 22, 'label': 'text', 'score': 0.9840295910835266, 'coordinate': [389.30914, 609.3971, 743.0931, 750.32263]}, {'cls_id': 22, 'label': 'text', 'score': 0.9845904111862183, 'coordinate': [389.1331, 751.77673, 743.05884, 894.88196]}, {'cls_id': 22, 'label': 'text', 'score': 0.9848388433456421, 'coordinate': [388.83295, 
896.0353, 743.5821, 1038.7367]}, {'cls_id': 22, 'label': 'text', 'score': 0.9804726243019104, 'coordinate': [389.0833, 1039.9131, 742.7598, 1134.4902]}, {'cls_id': 22, 'label': 'text', 'score': 0.9864556789398193, 'coordinate': [388.5259, 1135.8118, 743.45215, 1352.0105]}, {'cls_id': 22, 'label': 'text', 'score': 0.9869311451911926, 'coordinate': [769.8312, 775.6598, 1124.9835, 1063.2106]}, {'cls_id': 22, 'label': 'text', 'score': 0.9822818040847778, 'coordinate': [770.3026, 1063.9371, 1124.8307, 1184.2206]}, {'cls_id': 17, 'label': 'paragraph_title', 'score': 0.968923032283783, 'coordinate': [791.3031, 1199.3169, 1104.454, 1264.6992]}, {'cls_id': 22, 'label': 'text', 'score': 0.9712913036346436, 'coordinate': [770.42285, 1279.6072, 1124.6924, 1351.8679]}, {'cls_id': 22, 'label': 'text', 'score': 0.9236321449279785, 'coordinate': [1153.9055, 775.5812, 1334.0662, 798.1588]}, {'cls_id': 22, 'label': 'text', 'score': 0.985789954662323, 'coordinate': [1151.5193, 799.27954, 1506.362, 991.1172]}, {'cls_id': 22, 'label': 'text', 'score': 0.9820653796195984, 'coordinate': [1151.5708, 991.9118, 1506.6016, 1110.8875]}, {'cls_id': 22, 'label': 'text', 'score': 0.9865990877151489, 'coordinate': [1151.6917, 1112.1348, 1507.1611, 1351.9453]}]}, 'parsing_res_list': [{'block_label': 'doc_title', 'block_content': '助力双方交往 搭建友谊桥梁', 'block_bbox': [131, 36, 1384, 127]}, {'block_label': 'text', 'block_content': '本报记者 沈小晓 任彦 黄培昭', 'block_bbox': [585, 158, 930, 182]}, {'block_label': 'text', 'block_content': '身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等,曼妙的舞姿赢得现场观众阵阵掌声。这是日前厄立特里亚高等教育与研究院孔子学院(以下简称“厄特孔院”)举办“喜迎新年”中国歌舞比赛的场景。', 'block_bbox': [9, 200, 361, 343]}, {'block_label': 'image', 'block_content': '', 'block_bbox': [775, 200, 1503, 684]}, {'block_label': 'text', 'block_content': '中国和厄立特里亚传统友谊深厚。近年来,在高质量共建“一带一路”框架下,中厄两国人文交流不断深化,互利合作的民意基础日益深厚。', 'block_bbox': [9, 344, 361, 440]}, {'block_label': 'paragraph_title', 'block_content': '“学好中文,我们的未来不是梦”', 'block_bbox': [28, 455, 341, 520]}, 
{'block_label': 'text', 'block_content': '鲜花曾告诉我你怎样走过,大地知道你心中的每一个角落……”厄立特里亚阿斯马拉大学综合楼二层,一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门,学生们正跟着老师学唱中文歌曲《同一首歌》。', 'block_bbox': [8, 536, 361, 655]}, {'block_label': 'text', 'block_content': '这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起,学生们边唱边随着节拍摇动身体,现场气氛热烈。', 'block_bbox': [8, 657, 362, 774]}, {'block_label': 'vision_footnote', 'block_content': '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。\n中国驻厄立特里亚大使馆供图', 'block_bbox': [809, 703, 1488, 750]}, {'block_label': 'text', 'block_content': '“这是中文歌曲初级班,共有32人。学生大部分来自首都阿斯马拉的中小学,年龄最小的仅有6岁。”尤斯拉告诉记者。', 'block_bbox': [9, 776, 361, 846]}, {'block_label': 'text', 'block_content': '尤斯拉今年23岁,是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文,在2017年第十届“汉语桥”世界中学生中文比赛中获得厄立特里亚赛区第一名,并和同伴代表厄立特里亚前往中国参加决赛, +获得团体优胜奖。2022年起,尤斯拉开始在厄特孔院兼职教授中文歌曲,每周末两个课时。“中国文化博大精深,我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。', 'block_bbox': [8, 848, 361, 1062]}, {'block_label': 'text', 'block_content': '“姐姐,你想去中国吗?”“非常想!我想去看故宫、爬长城。”尤斯拉的学生中有一对能歌善舞的姐妹,姐姐露娅今年15岁,妹妹莉娅14岁,两人都已在厄特孔院学习多年,中文说得格外流利。', 'block_bbox': [8, 1063, 361, 1182]}, {'block_label': 'text', 'block_content': '露娅对记者说:“这些年来,怀着对中文和中国文化的热爱,我们姐妹俩始终相互鼓励,一起学习。我们的中文一天比一天好,还学会了中文歌和中国舞。我们一定要到中国去。学好中文,我们的未来不是梦!”', 'block_bbox': [8, 1184, 361, 1302]}, {'block_label': 'text', 'block_content': '据厄特孔院中方院长黄鸣飞介绍,这所孔院成立于2013年3月,由贵州财经大学和厄立特里亚高等教育与研究院合作建立,开设了中国语言课程和中国文化课程,注册学生2万余人次。10余年来,厄特孔院已成为当地民众了解中国的一扇窗口。', 'block_bbox': [9, 1304, 361, 1351]}, {'block_label': 'text', 'block_content': '', 'block_bbox': [389, 200, 742, 295]}, {'block_label': 'text', 'block_content': '黄鸣飞表示,随着来学习中文的人日益增多,阿斯马拉大学教学点已难以满足教学需要。2024年4月,由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设,预计今年上半年竣工,建成后将为厄特孔院提供全新的办学场地。', 'block_bbox': [388, 297, 744, 441]}, {'block_label': 'paragraph_title', 'block_content': '“在中国学习的经历让我看到更广阔的世界”', 'block_bbox': [409, 455, 721, 520]}, {'block_label': 'text', 'block_content': '多年来,厄立特里亚广大赴华留学生和培训人员积极投身国家建设,成为助力该国发展的人才和厄中友好的见证者和推动者。', 'block_bbox': [389, 536, 742, 608]}, {'block_label': 'text', 'block_content': 
'在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位,研究方向是女性领导力与社会发展。其间,她实地走访中国多个地区,获得了观察中国社会发展的第一手资料。', 'block_bbox': [389, 609, 743, 750]}, {'block_label': 'text', 'block_content': '谈起在中国求学的经历,约翰娜记忆犹新:“中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行,中国创造了发展奇迹,这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”', 'block_bbox': [389, 751, 743, 894]}, {'block_label': 'text', 'block_content': '正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前,在北京师范大学获得硕士学位后,穆卢盖塔在社交媒体上写下这样一段话:“这是我人生的重要一步,自此我拥有了一双坚固的鞋子,赋予我穿越荆棘的力量。”', 'block_bbox': [388, 896, 743, 1038]}, {'block_label': 'text', 'block_content': '穆卢盖塔密切关注中国在经济、科技、教育等领域的发展,“中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界,从中受益匪浅。”', 'block_bbox': [389, 1039, 742, 1134]}, {'block_label': 'text', 'block_content': '23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年,在中国书法、中国画等方面表现十分优秀,在2024年厄立特里亚赛区的“汉语桥”比赛中获得一等奖。莉迪亚说:“学习中国书法让我的内心变得安宁和纯粹。我也喜欢中国的服饰,希望未来能去中国学习,把中国不同民族元素融入服装设计中,创作出更多精美作品,也把厄特文化分享给更多的中国朋友。”\n“不管远近都是客人,请不用客气;相约好了在一起,我们欢迎你……”在一场中厄青年联谊活动上,四川路桥中方员工同当地大学生合唱《北京 +欢迎你》。厄立特里亚技术学院计算机科学与工程专业学生鲁夫塔·谢拉是其中一名演唱者,她很早便在孔院学习中文,一直在为去中国留学作准备。“这句歌词是我们两国人民友谊的生动写照。无论是投身于厄特里亚基础设施建设的中企员工,还是在中国留学的厄立特里亚学子,两国人民携手努力,必将推动两国关系不断向前发展。”鲁夫塔说。', 'block_bbox': [388, 1135, 743, 1352]}, {'block_label': 'text', 'block_content': '', 'block_bbox': [769, 775, 1124, 1063]}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育委员会主任助理萨马瑞表示:“每年我们都会组织学生到中国访问学习,目前有超过5000名厄立特里亚学生在中国留学。学习中国的教育经验,有助于提升厄立特里亚的教育水平。”', 'block_bbox': [770, 1063, 1124, 1184]}, {'block_label': 'paragraph_title', 'block_content': '“共同向世界展示非洲和亚洲的灿烂文明”', 'block_bbox': [791, 1199, 1104, 1264]}, {'block_label': 'text', 'block_content': '从阿斯马拉出发,沿着蜿蜒曲折的盘山公路一路向东寻找丝路印迹。驱车两个小时,记者来到位于厄立特里亚港口城市马萨瓦的北红海省博物馆。', 'block_bbox': [770, 1279, 1124, 1351]}, {'block_label': 'text', 'block_content': '', 'block_bbox': [1153, 775, 1334, 798]}, {'block_label': 'text', 'block_content': '博物馆二层陈列着一个发掘自阿杜利斯古城的中国古代陶制酒器,罐身上写着“万”“和”“禅”“山”等汉字。“这件文物证明,很早以前我们就通过海上丝绸之路进行贸易往来与文化交流。这也是厄立特里亚与中国友好交往历史的有力证明。”北红海省博物馆研究与文献部负责人伊萨亚斯·特斯法兹吉说。', 'block_bbox': [1151, 799, 1506, 991]}, {'block_label': 'text', 
'block_content': '厄立特里亚国家博物馆考古学和人类学研究员菲尔蒙·特韦尔德十分喜爱中国文化。他表示:“学习彼此的语言和文化,将帮助厄中两国人民更好地理解彼此,助力双方交往,搭建友谊桥梁。”', 'block_bbox': [1151, 991, 1506, 1110]}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆馆长塔吉丁·努里达姆·优素福曾多次 +访问中国,对中华文明的传承与创新、现代化博物馆的建设与发展印象深刻。“中国博物馆不仅有许多保存完好的文物,还充分运用先进科技手段进行展示,帮助人们更好理解中华文明。”塔吉丁说,“厄立特里亚与中国都拥有悠久的文明,始终相互理解、相互尊重。我希望未来与中国同行加强合作,共同向世界展示非洲和亚洲的灿烂文明。”', 'block_bbox': [1151, 1112, 1507, 1351]}]}} + +**benchmark** + +1. 下载和解压image数据集 + +```bash +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar +tar xvf images.tar +``` + +2. 准备推理脚本`infer_ocr_vl_benchmark.py` + +```python +import os +from paddleocr import PaddleOCRVL + +input_path = "./images" +pipeline = PaddleOCRVL(vl_rec_backend="fastdeploy-server", vl_rec_server_url="http://127.0.0.1:8180/v1") +file_list = os.listdir(input_path) +for file_name in file_list: + file_path = os.path.join(input_path, file_name) + output = pipeline.predict(file_path) + for res in output: + res.print() + res.save_to_markdown(save_path="output", pretty=False) +``` + +3. 客户端执行`infer_ocr_vl_benchmark.py` + +```bash +python3 infer_ocr_vl_benchmark.py +``` + +每推理完一张图片,会在`output`路径下生成一个对应的`md`文件,跑完整个benchmark(1355张图片)大概需要5个小时。 diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 4fc0010f3..8a01350a1 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -535,7 +535,12 @@ class EngineArgs: f"scheduler, please provide --router argument." 
) - if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_maca()): + if not ( + current_platform.is_cuda() + or current_platform.is_xpu() + or current_platform.is_maca() + or current_platform.is_iluvatar() + ): envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 if "PaddleOCR" in get_model_architecture(self.model, self.model_config_name): diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 0d4c062af..1b49dfcec 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -428,6 +428,10 @@ class ResourceManagerV1(ResourceManager): grid_thw = paddle.to_tensor(grid_thw, dtype="int64") if current_platform.is_xpu(): from fastdeploy.model_executor.ops.xpu import get_img_boundaries + elif current_platform.is_iluvatar(): + from fastdeploy.model_executor.ops.iluvatar import ( + get_img_boundaries, + ) else: from fastdeploy.model_executor.ops.gpu import get_img_boundaries diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index 6fa82573e..41b9864cf 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -95,35 +95,54 @@ class IluvatarAttnBackend(AttentionBackend): self.num_layers = fd_config.model_config.num_hidden_layers self.dtype = paddle.get_default_dtype() self.enable_mm = fd_config.model_config.enable_mm + self.rope_batch_stride = self.max_context_len * self.head_dim if self.enable_mm else 0 + if "paddleocr" in fd_config.model_config.model_type: + self.is_interleaved_rope_mode = False + else: + self.is_interleaved_rope_mode = True + + def split_cos_sin(self, batch_ids, forward_meta: ForwardMeta): + if self.enable_mm: + # the num_seqs dim of rotary_embs > 1 (e.g. 
ernie-vl and paddleocr-vl) + cos = forward_meta.rotary_embs[batch_ids, 0, 0, :, :, :] + sin = forward_meta.rotary_embs[batch_ids, 1, 0, :, :, :] + else: + # the num_seqs dim of rotary_embs = 1 (e.g. ernie-text) + cos = forward_meta.rotary_embs[0, 0, :, :, :] + sin = forward_meta.rotary_embs[1, 0, :, :, :] + return cos, sin def init_attention_metadata(self, forward_meta: ForwardMeta): """Initialize attntion metadata hence all layers in the forward pass can reuse it.""" - if self.enable_mm: - # VL: TODO: The first 0 may need to be replaced with batch_id - # of max_num_seqs when running multiple batch case later - self.rope_cos = forward_meta.rotary_embs[0, 0, 0, :, :, :] - self.rope_sin = forward_meta.rotary_embs[0, 1, 0, :, :, :] - else: - # text - self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :] - self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :] self.prefill_info_dict = {} self.decode_info_dict = {} self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0] self.decode_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_decoder)[0] self.prefill_len = len(self.prefill_info_dict["batch_ids"]) self.decode_len = len(self.decode_info_dict["batch_ids"]) + prefill_batch_ids = self.prefill_info_dict["batch_ids"] + decode_batch_ids = self.decode_info_dict["batch_ids"] + if prefill_batch_ids.dim() == 0: + prefill_batch_ids = prefill_batch_ids.unsqueeze(0) + if decode_batch_ids.dim() == 0: + decode_batch_ids = decode_batch_ids.unsqueeze(0) # only prefill if self.decode_len == 0: - cu_seq_ids = list(range(self.prefill_len + 1)) - self.prefill_info_dict["cu_seqlens_q"] = forward_meta.cu_seqlens_q[cu_seq_ids] self.mixed = False + cu_seq_ids = self.prefill_info_dict["batch_ids"] + 1 + self.prefill_info_dict["cu_seqlens_q"] = paddle.concat( + [forward_meta.cu_seqlens_q[:1], forward_meta.cu_seqlens_q[cu_seq_ids]] + ) + self.rope_cos, self.rope_sin = self.split_cos_sin(prefill_batch_ids, forward_meta) # only decode elif 
self.prefill_len == 0: self.mixed = False + self.rope_cos, self.rope_sin = self.split_cos_sin(decode_batch_ids, forward_meta) # both prefill and decode else: self.mixed = True + self.prefill_rope_cos, self.prefill_rope_sin = self.split_cos_sin(prefill_batch_ids, forward_meta) + self.decode_rope_cos, self.decode_rope_sin = self.split_cos_sin(decode_batch_ids, forward_meta) self.prefill_num_tokens = paddle.sum(forward_meta.seq_lens_encoder).item() self.prefill_info_dict["cu_seqlens_q"] = paddle.zeros( [self.prefill_len + 1], dtype=forward_meta.cu_seqlens_q.dtype @@ -141,7 +160,7 @@ class IluvatarAttnBackend(AttentionBackend): ) prefill_start, decode_start, start = 0, self.prefill_num_tokens, 0 - non_zeros_ids = forward_meta.seq_lens_this_time != 0 + non_zeros_ids = paddle.where(forward_meta.seq_lens_this_time)[0] non_zeros_seq_lens = forward_meta.seq_lens_this_time[non_zeros_ids] end = non_zeros_seq_lens[0] if end > 1: @@ -234,6 +253,8 @@ class IluvatarAttnBackend(AttentionBackend): v_cache, block_tables=forward_meta.block_tables[self.prefill_info_dict["batch_ids"], :], cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"], + rope_sin=self.rope_sin, + rope_cos=self.rope_cos, num_heads=self.num_heads, head_dim=self.head_dim, num_kv_heads=self.num_kv_heads, @@ -244,8 +265,7 @@ class IluvatarAttnBackend(AttentionBackend): q_rope=True, k_rope=True, v_rope=False, - rope_sin=self.rope_sin, - rope_cos=self.rope_cos, + is_interleaved_rope_mode=self.is_interleaved_rope_mode, ) elif self.prefill_len == 0: output = paged_attention( @@ -272,6 +292,8 @@ class IluvatarAttnBackend(AttentionBackend): v=qkv, rope_sin=self.rope_sin, rope_cos=self.rope_cos, + rope_batch_stride=self.rope_batch_stride, + is_interleaved_rope_mode=self.is_interleaved_rope_mode, ) else: output = mixed_fused_paged_attention( @@ -282,6 +304,8 @@ class IluvatarAttnBackend(AttentionBackend): decode_block_tables=forward_meta.block_tables[self.decode_info_dict["batch_ids"], :], 
cu_seqlens_qkv=self.prefill_info_dict["cu_seqlens_q"], seq_lens=forward_meta.seq_lens_decoder[self.decode_info_dict["batch_ids"], 0] + 1, + prefill_rope_sin=self.prefill_rope_sin, + prefill_rope_cos=self.prefill_rope_cos, prefill_num_tokens=self.prefill_num_tokens, num_heads=self.num_heads, head_dim=self.head_dim, @@ -298,8 +322,10 @@ class IluvatarAttnBackend(AttentionBackend): softcap=self.attention_metadata.softcap, use_cuda_graph=self.attention_metadata.use_cuda_graph, use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi, - rope_sin=self.rope_sin, - rope_cos=self.rope_cos, + decode_rope_sin=self.decode_rope_sin, + decode_rope_cos=self.decode_rope_cos, + rope_batch_stride=self.rope_batch_stride, + is_interleaved_rope_mode=self.is_interleaved_rope_mode, ) return output diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index bce887dcc..4e564cedd 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -54,6 +54,7 @@ from fastdeploy.model_executor.models.model_base import ( ModelForCasualLM, ModelRegistry, ) +from fastdeploy.platforms import current_platform class Ernie4_5_VLMLP(Ernie4_5_MLP): @@ -539,6 +540,10 @@ class Ernie4_5_VLModel(nn.Layer): text_image_index_out(vl_moe_meta.token_type_ids, vl_moe_meta.text_index, vl_moe_meta.image_index) hidden_states = input_embeddings + + if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed: + hidden_states = forward_meta.attn_backend.transpose(hidden_states) + residual = None for i in range(self.num_layers): @@ -550,6 +555,10 @@ class Ernie4_5_VLModel(nn.Layer): ) out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0] + + if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed: + out = forward_meta.attn_backend.reverse_transpose(out) + return out diff --git 
a/fastdeploy/model_executor/models/paddleocr_vl/paddleocr_vl.py b/fastdeploy/model_executor/models/paddleocr_vl/paddleocr_vl.py index 21cc2676a..780763605 100644 --- a/fastdeploy/model_executor/models/paddleocr_vl/paddleocr_vl.py +++ b/fastdeploy/model_executor/models/paddleocr_vl/paddleocr_vl.py @@ -40,6 +40,7 @@ from fastdeploy.model_executor.utils import ( default_weight_loader, process_weights_after_loading, ) +from fastdeploy.platforms import current_platform from .projector import Projector from .siglip import SiglipVisionModel @@ -101,12 +102,19 @@ class PaddleOCRVLModel(nn.Layer): forward_meta: ForwardMeta, ): hidden_states = input_embeddings + + if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed: + hidden_states = forward_meta.attn_backend.transpose(hidden_states) + residual = None for i in range(self.num_layers): hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) out = self.norm(hidden_states, residual)[0] + if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed: + out = forward_meta.attn_backend.reverse_transpose(out) + return out diff --git a/fastdeploy/model_executor/ops/iluvatar/paged_attention.py b/fastdeploy/model_executor/ops/iluvatar/paged_attention.py index 877ba0a76..4e28e5784 100644 --- a/fastdeploy/model_executor/ops/iluvatar/paged_attention.py +++ b/fastdeploy/model_executor/ops/iluvatar/paged_attention.py @@ -52,6 +52,8 @@ def paged_attention( v: paddle.Tensor = None, rope_sin: paddle.Tensor = None, rope_cos: paddle.Tensor = None, + rope_batch_stride: int = 0, + is_interleaved_rope_mode: bool = True, ): return paged_attn( q, @@ -77,6 +79,8 @@ def paged_attention( use_cuda_graph, use_sqrt_alibi, merged_qkv, + rope_batch_stride, + is_interleaved_rope_mode, ) @@ -86,6 +90,8 @@ def prefill_fused_paged_attention( v_cache: paddle.Tensor, block_tables: paddle.Tensor, cu_seqlens_qkv: paddle.Tensor, + rope_sin: paddle.Tensor, + rope_cos: paddle.Tensor, num_heads: int, head_dim: int, 
num_kv_heads: int, @@ -96,8 +102,7 @@ def prefill_fused_paged_attention( q_rope: bool = True, k_rope: bool = True, v_rope: bool = False, - rope_sin: paddle.Tensor = None, - rope_cos: paddle.Tensor = None, + is_interleaved_rope_mode: bool = True, ): return prefill_fused_paged_attn( qkv, @@ -117,6 +122,7 @@ def prefill_fused_paged_attention( q_rope, k_rope, v_rope, + is_interleaved_rope_mode, ) @@ -128,6 +134,8 @@ def mixed_fused_paged_attention( decode_block_tables: paddle.Tensor, cu_seqlens_qkv: paddle.Tensor, seq_lens: paddle.Tensor, + prefill_rope_sin: paddle.Tensor, + prefill_rope_cos: paddle.Tensor, prefill_num_tokens: int, num_heads: int, head_dim: int, @@ -144,8 +152,10 @@ def mixed_fused_paged_attention( softcap: float = 0.0, use_cuda_graph: bool = False, use_sqrt_alibi: bool = False, - rope_sin: paddle.Tensor = None, - rope_cos: paddle.Tensor = None, + decode_rope_sin: paddle.Tensor = None, + decode_rope_cos: paddle.Tensor = None, + rope_batch_stride: int = 0, + is_interleaved_rope_mode: bool = True, ): return mixed_fused_paged_attn( qkv, @@ -155,8 +165,10 @@ def mixed_fused_paged_attention( decode_block_tables, cu_seqlens_qkv, seq_lens, - rope_sin, - rope_cos, + prefill_rope_sin, + prefill_rope_cos, + decode_rope_sin, + decode_rope_cos, prefill_num_tokens, num_heads, head_dim, @@ -173,4 +185,6 @@ def mixed_fused_paged_attention( softcap, use_cuda_graph, use_sqrt_alibi, + rope_batch_stride, + is_interleaved_rope_mode, ) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 0ed6ec2c6..f07c663b5 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -33,6 +33,7 @@ if current_platform.is_iluvatar(): set_stop_value_multi_ends, step_paddle, update_inputs, + update_inputs_v1, ) elif current_platform.is_gcu(): from fastdeploy.model_executor.ops.gcu import ( diff --git a/fastdeploy/worker/gpu_model_runner.py 
b/fastdeploy/worker/gpu_model_runner.py index 8acd259d2..a614a354f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -56,11 +56,11 @@ from fastdeploy.platforms import current_platform if current_platform.is_iluvatar(): from fastdeploy.model_executor.ops.iluvatar import ( + recover_decode_task, set_data_ipc, set_value_by_flags_and_idx, ) - recover_decode_task = None share_external_data = None elif current_platform.is_dcu(): from fastdeploy.model_executor.ops.gpu import set_value_by_flags_and_idx @@ -467,7 +467,7 @@ class GPUModelRunner(ModelRunnerBase): multi_vision_inputs["encoder_cache_info"].append((mm_hash, feature_positions[i], False)) if envs.FD_ENABLE_MAX_PREFILL: multi_vision_inputs["images_lst"].append( - inputs["images"][image_start_idx : image_start_idx + image_offset].cuda() + inputs["images"][image_start_idx : image_start_idx + image_offset].to(self.device) ) multi_vision_inputs["grid_thw_lst"].append(paddle.to_tensor(grid_thw_list[i])) multi_vision_inputs["cu_seqlens"].append(vit_seqlen_list[i]) @@ -486,7 +486,7 @@ class GPUModelRunner(ModelRunnerBase): else: if envs.FD_ENABLE_MAX_PREFILL: multi_vision_inputs["images_lst"].append( - inputs["images"][request.image_start : request.image_end].cuda() + inputs["images"][request.image_start : request.image_end].to(self.device) ) multi_vision_inputs["grid_thw_lst"].extend( paddle.to_tensor(inputs["grid_thw"][request.num_image_start : request.num_image_end]) diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 15dc8472c..63c022289 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -38,7 +38,6 @@ class IluvatarModelRunner(GPUModelRunner): ) assert not self.speculative_decoding, "Iluvatar does not support speculative decoding" assert self.guided_backend is None, "Iluvatar does not support guided decoding" - assert not envs.ENABLE_V1_KVCACHE_SCHEDULER, 
"Iluvatar does not support v1 kvcache scheduler" assert not self.cache_config.enable_prefix_caching, "Iluvatar does not support prefix caching" self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN" assert not self.mla_cache, "Iluvatar does not support MLA" @@ -48,9 +47,9 @@ class IluvatarModelRunner(GPUModelRunner): not self.cache_config.enable_chunked_prefill ), "Iluvatar does not support chunked prefill for VL model" # VL neox style = True - if self.enable_mm: - emb_shape = self.share_inputs["rope_emb"].shape - emb_shape[-1] *= 2 + emb_shape = self.share_inputs["rope_emb"].shape + if emb_shape[-1] == self.model_config.head_dim // 2: + emb_shape[-1] = self.model_config.head_dim self.share_inputs["rope_emb"] = paddle.full( shape=emb_shape, fill_value=0, diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 386228fe6..668229183 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -983,7 +983,12 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}") logger.info(f"- Load strategy: {load_config.load_strategy}") - if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_maca()): + if not ( + current_platform.is_cuda() + or current_platform.is_xpu() + or current_platform.is_maca() + or current_platform.is_iluvatar() + ): logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.") envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 diff --git a/requirements_iluvatar.txt b/requirements_iluvatar.txt index d91cf1639..0cb60ae88 100644 --- a/requirements_iluvatar.txt +++ b/requirements_iluvatar.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn==0.29.0 fastapi -paddleformers==0.3.1 +paddleformers==0.4.0 redis etcd3 httpx diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh index c47fe2c4c..d2e00786c 100644 --- a/scripts/run_ci_iluvatar.sh +++ 
b/scripts/run_ci_iluvatar.sh @@ -4,7 +4,6 @@ echo "$DIR" #先kill一遍 ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true -ixsmi -r unset http_proxy unset https_proxy @@ -15,14 +14,13 @@ ln -sf /usr/local/bin/python3 /usr/local/bin/python echo "pip requirements" python -m pip install -r requirements_iluvatar.txt echo "install paddle cpu and custom device" -python -m pip install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ +python -m pip install paddlepaddle==3.3.0.dev20251103 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ +python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251107 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ echo "build whl" bash build.sh || exit 1 CI_PATH=tests/ci_use/iluvatar_UT export INFERENCE_MSG_QUEUE_ID=232132 -export FD_DEBUG=1 export PADDLE_XCCL_BACKEND=iluvatar_gpu export FD_SAMPLING_CLASS=rejection @@ -42,8 +40,17 @@ do ps -efww | grep -E '${cur_test_file}' | grep -v grep | awk '{print $2}' | xargs kill -9 || true if [ ${exit_code} -ne 0 ]; then - echo "log/workerlog.0" - cat log/workerlog.0 + if [ ! -f "./log/workerlog.0" ]; then + echo "------------------- log/launch_worker.log -----------------" + cat log/launch_worker.log + else + echo "------------------- log/workerlog.0 -----------------" + cat log/workerlog.0 + fi + if [ -f "log/fastdeploy_error.log" ]; then + echo "------------------- log/fastdeploy_error.log -----------------" + cat log/fastdeploy_error.log + fi exit 1 fi done diff --git a/tests/ci_use/iluvatar_UT/bench_gsm8k.py b/tests/ci_use/iluvatar_UT/bench_gsm8k.py new file mode 100644 index 000000000..6f792eb3d --- /dev/null +++ b/tests/ci_use/iluvatar_UT/bench_gsm8k.py @@ -0,0 +1,235 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Fastdeploy + ERNIE-4.5-Turbo 的指标评估""" +# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py +import argparse +import ast +import json +import re +import time +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import requests +from tqdm import tqdm + +INVALID = -9999999 + + +def call_generate(prompt, **kwargs): + """ + Generates response based on the input prompt. + + Args: + prompt (str): The input prompt text. + **kwargs: Keyword arguments, including server IP address and port number. + + Returns: + str: The response generated based on the prompt. + + """ + url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions" + headers = {"Content-Type": "application/json"} + data = { + "messages": [ + { + "role": "user", + "content": prompt, + } + ], + "temperature": 0.6, + "max_tokens": 2047, + "top_p": 0.95, + "do_sample": True, + } + + response = requests.post(url, headers=headers, data=json.dumps(data)) + out = response.json() + return out["choices"][0]["message"]["content"] + + +def get_one_example(lines, i, include_answer): + """ + Retrieves a question-answer example from the given list of text lines. + + Args: + lines (list of dict): A list of question-answer pairs. + i (int): The index of the question-answer pair to retrieve from lines. + include_answer (bool): Whether to include the answer in the returned string. 
+ + Returns: + str: A formatted question-answer string in the format "Question: \nAnswer: ". + + """ + ret = "Question: " + lines[i]["question"] + "\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """ + Selects k examples from the given list of text lines and concatenates them into a single string. + + Args: + lines (list): A list containing text lines. + k (int): The number of examples to select. + + Returns: + str: A string composed of k examples, separated by two newline characters. + """ + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """ + Extracts numerical values from an answer string and returns them. + + Args: + answer_str (str): The string containing the answer. + + Returns: + The extracted numerical value; returns "INVALID" if extraction fails. + """ + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def read_jsonl(filename: str): + """ + Reads a JSONL file. + + Args: + filename (str): Path to the JSONL file. + + Yields: + dict: A dictionary object corresponding to each line in the JSONL file. + """ + with open(filename) as fin: + for line in fin: + if line.startswith("#"): + continue + yield json.loads(line) + + +def main(args): + """ + Process inputs and generate answers by calling the model in parallel using a thread pool. + + Args: + args (argparse.Namespace): + - num_questions (int): Number of questions to process. + - num_shots (int): Number of few-shot learning examples. + - ip (str): IP address of the model service. + - port (int): Port number of the model service. + - parallel (int): Number of questions to process in parallel. + - result_file (str): File path to store the results. 
+ + Returns: + None + + """ + # Read data + filename = "test.jsonl" + + lines = list(read_jsonl(filename)) + + # Construct prompts + num_questions = args.num_questions + num_shots = args.num_shots + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + + states = [None] * len(labels) + + # Use thread pool + def get_one_answer(i): + answer = call_generate( + prompt=few_shot_examples + questions[i], + # stop=["Question", "Assistant:", "<|separator|>"], + ip=args.ip, + port=args.port, + ) + states[i] = answer + + tic = time.time() + if args.parallel == 1: + for i in tqdm(range(len(questions))): + get_one_answer(i) + else: + with ThreadPoolExecutor(args.parallel) as executor: + list( + tqdm( + executor.map(get_one_answer, list(range(len(questions)))), + total=len(questions), + ) + ) + + latency = time.time() - tic + preds = [] + for i in range(len(states)): + preds.append(get_answer_value(states[i])) + + # Compute accuracy + acc = np.mean(np.array(preds) == np.array(labels)) + invalid = np.mean(np.array(preds) == INVALID) + + # Print results + print(f"Accuracy: {acc:.3f}") + print(f"Invalid: {invalid:.3f}") + print(f"Latency: {latency:.3f} s") + + with open(args.result_file, "a") as fout: + value = { + "task": "gsm8k", + "backend": "paddlepaddle", + "num_gpus": 1, + "latency": round(latency, 3), + "accuracy": round(acc, 3), + "num_requests": args.num_questions, + "other": { + "num_questions": args.num_questions, + "parallel": args.parallel, + }, + } + fout.write(json.dumps(value) + "\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--ip", type=str, default="127.0.0.1") + parser.add_argument("--port", type=str, default="8188") + parser.add_argument("--num-shots", type=int, default=10) + 
parser.add_argument("--data-path", type=str, default="test.jsonl") + parser.add_argument("--num-questions", type=int, default=1319) + parser.add_argument("--result-file", type=str, default="result.jsonl") + parser.add_argument("--parallel", type=int, default=1) + args = parser.parse_args() + main(args) diff --git a/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py b/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py index 0ffa39b1c..de0e3e930 100644 --- a/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py +++ b/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py @@ -12,43 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import functools +import os import sys -import threading from fastdeploy import LLM, SamplingParams from fastdeploy.utils import set_random_seed +tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +sys.path.insert(0, tests_dir) -def timeout(seconds): - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - result = [None] - exception = [None] - - def target(): - try: - result[0] = func(*args, **kwargs) - except Exception as e: - exception[0] = e - - thread = threading.Thread(target=target) - thread.daemon = True - thread.start() - thread.join(seconds) - - if thread.is_alive(): - raise TimeoutError(f"Function timed out after {seconds} seconds") - - if exception[0]: - raise exception[0] - - return result[0] - - return wrapper - - return decorator +from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout @timeout(80) @@ -75,15 +48,15 @@ def offline_infer_check(): 59335, 68170, 183, - 49080, - 94717, - 82966, - 99140, - 31615, - 51497, - 94851, - 60764, - 10889, + 97404, + 100088, + 36310, + 95633, + 95913, + 41459, + 95049, + 94970, + 96840, 2, ], f"{outputs[0].outputs.token_ids}" print("PASSED") @@ -94,10 +67,7 @@ if __name__ == "__main__": result = offline_infer_check() sys.exit(0) except TimeoutError: - print( - "The timeout exit may be 
due to multiple processes sharing the " - "same gpu card. You can check this using ixsmi on the device." - ) + print(TIMEOUT_MSG) sys.exit(124) except Exception: sys.exit(1) diff --git a/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py b/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py index d0da1ae72..8efba5c99 100644 --- a/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py +++ b/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import functools import io +import os import sys -import threading import requests from PIL import Image @@ -24,39 +23,13 @@ from fastdeploy import LLM, SamplingParams from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer from fastdeploy.utils import set_random_seed +tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +sys.path.insert(0, tests_dir) -def timeout(seconds): - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - result = [None] - exception = [None] - - def target(): - try: - result[0] = func(*args, **kwargs) - except Exception as e: - exception[0] = e - - thread = threading.Thread(target=target) - thread.daemon = True - thread.start() - thread.join(seconds) - - if thread.is_alive(): - raise TimeoutError(f"Function timed out after {seconds} seconds") - - if exception[0]: - raise exception[0] - - return result[0] - - return wrapper - - return decorator +from ci_use.iluvatar_UT.utils import TIMEOUT_MSG, timeout -@timeout(180) +@timeout(210) def offline_infer_check(): set_random_seed(123) @@ -122,9 +95,9 @@ def offline_infer_check(): 5119, 93956, 68725, - 14449, - 4356, - 38225, + 100282, + 23, + 23, 2, ], f"{outputs[0].outputs.token_ids}" print("PASSED") @@ -135,10 +108,7 @@ if __name__ == "__main__": result = offline_infer_check() sys.exit(0) except TimeoutError: - print( - "The timeout exit may be due to multiple processes sharing the " - "same gpu card. 
You can check this using ixsmi on the device." - ) + print(TIMEOUT_MSG) sys.exit(124) except Exception: sys.exit(1) diff --git a/tests/ci_use/iluvatar_UT/utils.py b/tests/ci_use/iluvatar_UT/utils.py new file mode 100644 index 000000000..10b2481f2 --- /dev/null +++ b/tests/ci_use/iluvatar_UT/utils.py @@ -0,0 +1,28 @@ +import functools +import signal + + +def timeout(seconds): + def decorator(func): + def _handle_timeout(signum, frame): + raise TimeoutError(f"Function '{func.__name__}' timed out after {seconds} seconds") + + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_handler = signal.signal(signal.SIGALRM, _handle_timeout) + signal.alarm(seconds) + + try: + result = func(*args, **kwargs) + signal.alarm(0) + return result + finally: + signal.signal(signal.SIGALRM, original_handler) + signal.alarm(0) + + return wrapper + + return decorator + + +TIMEOUT_MSG = "The timeout exit may be due to multiple processes sharing the same gpu card. You can check this using ixsmi on the device."