[Precision] Support lm_head layer running in float32 (#3597)

* support lm_head fp32 bf16 fp16

* support lm_head fp32 bf16 fp16

* add doc and check code

* lm_head_fp32 specify lm_head as fp32

* code check

* check doc
chen
2025-08-27 11:34:53 +08:00
committed by GitHub
parent ad319a87cc
commit ce9c0917c5
15 changed files with 99 additions and 60 deletions
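For reference, a minimal usage sketch of the new option. The model name, the api_server launch line, and forwarding lm_head_fp32 as an LLM keyword argument are illustrative assumptions; the commit itself only adds the EngineArgs field and CLI flag shown in the diff below.

# Hypothetical usage sketch; model name and kwarg forwarding are assumptions,
# only the --lm_head-fp32 flag name comes from this commit.

# Online serving:
#   python -m fastdeploy.entrypoints.openai.api_server \
#       --model baidu/ERNIE-4.5-0.3B-Paddle --lm_head-fp32

# Offline inference, assuming EngineArgs fields are accepted as LLM kwargs:
from fastdeploy import LLM, SamplingParams

llm = LLM(model="baidu/ERNIE-4.5-0.3B-Paddle", lm_head_fp32=True)  # keep lm_head weights in float32
outputs = llm.generate(["Hello"], SamplingParams(temperature=0.8))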


@@ -370,6 +370,11 @@ class EngineArgs:
- "default_v1": default_v1 loader.
"""
lm_head_fp32: bool = False
"""
Flag to specify the dtype of lm_head as FP32. Default is False (uses the model's default dtype).
"""
def __post_init__(self):
"""
Post-initialization processing to set default tokenizer if not provided.
@@ -576,6 +581,12 @@ class EngineArgs:
default=EngineArgs.early_stop_config,
help="the config for early stop.",
)
model_group.add_argument(
"--lm_head-fp32",
action="store_true",
default=EngineArgs.lm_head_fp32,
help="Specify the dtype of lm_head weight as float32.",
)
# Parallel processing parameters group
parallel_group = parser.add_argument_group("Parallel Configuration")
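
The model-side change that actually builds lm_head in float32 is not part of this excerpt. A rough sketch of how a layer could select its weight dtype from such a flag (class and parameter names here are illustrative, not taken from the diff):

import paddle
from paddle import nn


class LMHead(nn.Layer):
    """Illustrative lm_head that can keep its projection weight in float32."""

    def __init__(self, hidden_size: int, vocab_size: int, lm_head_fp32: bool = False):
        super().__init__()
        # Use float32 when requested; otherwise follow the model's default
        # dtype (e.g. bfloat16 or float16).
        dtype = "float32" if lm_head_fp32 else paddle.get_default_dtype()
        self.weight = self.create_parameter(shape=[hidden_size, vocab_size], dtype=dtype)

    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
        # Cast activations to the weight dtype so the projection runs in
        # float32 when lm_head_fp32 is enabled.
        hidden_states = paddle.cast(hidden_states, self.weight.dtype)
        return paddle.matmul(hidden_states, self.weight)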