diff --git a/custom_ops/gpu_ops/per_token_quant_fp8.cu b/custom_ops/gpu_ops/per_token_quant_fp8.cu index 9a16d4d36..f195403a5 100644 --- a/custom_ops/gpu_ops/per_token_quant_fp8.cu +++ b/custom_ops/gpu_ops/per_token_quant_fp8.cu @@ -50,13 +50,11 @@ __global__ void quant_per_token_per_block(const T *input, max_value_thread = max(abs(load_vec_float[vid]), max_value_thread); } // get max value per warp - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 16), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 8), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 4), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 2), max_value_thread); - max_value_thread = max(__shfl_down_sync(0xffffffff, max_value_thread, 1), max_value_thread); - // broadcast max_value - max_value_thread = __shfl_sync(0xFFFFFFFF, max_value_thread, 0); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 16), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 8), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 4), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 2), max_value_thread); + max_value_thread = max(__shfl_xor_sync(0xffffffff, max_value_thread, 1), max_value_thread); max_value_thread = max(max_value_thread, epsilon); float scale_to_store = max_value_thread / MAX_VALUE; // quant diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md index 522d5ca76..a253e5ae7 100644 --- a/docs/get_started/installation/kunlunxin_xpu.md +++ b/docs/get_started/installation/kunlunxin_xpu.md @@ -218,4 +218,4 @@ for chunk in response: print('\n') ``` -For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion 
API](https://platform.openai.com/docs/api-reference/chat/create). +For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md). diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md index 642296ba2..1e099c1f6 100644 --- a/docs/zh/get_started/installation/kunlunxin_xpu.md +++ b/docs/zh/get_started/installation/kunlunxin_xpu.md @@ -223,4 +223,4 @@ for chunk in response: print('\n') ``` -OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create)。 +OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create)，以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../../online_serving/README.md)。