diff --git a/custom_ops/gpu_ops/per_token_quant_fp8.cu b/custom_ops/gpu_ops/per_token_quant_fp8.cu index 9a16d4d36..f195403a5 100644 --- a/custom_ops/gpu_ops/per_token_quant_fp8.cu +++ b/custom_ops/gpu_ops/per_token_quant_fp8.cu @@ -50,13 +50,11 @@ __global__ void quant_per_token_per_block(const T *input, max_value_thread = max(abs(load_vec_float[vid]), max_value_thread); } // get max value per warp - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 16), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 8), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 4), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 2), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 1), max_value_thread); - // broadcast max_value - max_value_thread = __shfl_sync(0xFFFFFFFF, max_value_thread, 0); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 16), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 8), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 4), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 2), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 1), max_value_thread); max_value_thread = max(max_value_thread, epsilon); float scale_to_store = max_value_thread / MAX_VALUE; // quant diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md index 522d5ca76..a253e5ae7 100644 --- a/docs/get_started/installation/kunlunxin_xpu.md +++ b/docs/get_started/installation/kunlunxin_xpu.md @@ -218,4 +218,4 @@ for chunk in response: print('\n') ``` -For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion 
API](https://platform.openai.com/docs/api-reference/chat/create). +For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md). diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md index 642296ba2..1e099c1f6 100644 --- a/docs/zh/get_started/installation/kunlunxin_xpu.md +++ b/docs/zh/get_started/installation/kunlunxin_xpu.md @@ -223,4 +223,4 @@ for chunk in response: print('\n') ``` -OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)。 +OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create)，以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../../online_serving/README.md)。