FastDeploy/fastdeploy/model_executor/guided_decoding/guidance_backend.py
Commit 83dbc4e5dd by Daci
[Feature] Guided Decoding add LLguidance backend (#5124)

* llguidance
* add requirements_guided_decoding.txt and doc
* fix test_guidance_*.py
* fix test_guidance_*.py && mv
* fix llguidance choice
* test_guidance_*
* rm lazy loader

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2025-12-03 20:23:57 +08:00

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import copy
import json
import traceback
from typing import Any, Optional, Tuple, Union

import llguidance
import llguidance.hf
import llguidance.torch
import torch

from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.envs import FD_GUIDANCE_DISABLE_ADDITIONAL, FD_LLGUIDANCE_LOG_LEVEL
from fastdeploy.model_executor.guided_decoding import (
    BackendBase,
    BaseChecker,
    LogitsProcessorBase,
)
from fastdeploy.utils import llm_logger

class LLGuidanceProcessor(LogitsProcessorBase):
    """
    LLGuidance-specific implementation of LogitsProcessorBase.

    This processor enforces grammar constraints during token generation using
    llguidance. It manages the grammar-matching state and applies token masks
    to logits.
    """

    def __init__(
        self,
        ll_matcher: llguidance.LLMatcher,
        ll_tokenizer: llguidance.LLTokenizer,
        serialized_grammar: str,
        vocab_size: int,
        batch_size: int,
        enable_thinking: bool = False,
    ):
        super().__init__(enable_reasoning=enable_thinking)
        self.matcher = ll_matcher
        self.ll_tokenizer = ll_tokenizer
        self.serialized_grammar = serialized_grammar
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.is_terminated: bool = False
        self._printed_error: bool = False

    def _check_error(self):
        """Check for and log any error reported by the LLMatcher."""
        if not self._printed_error:
            err = self.matcher.get_error()
            if err:
                self._printed_error = True
                llm_logger.warning(f"LLGuidance Matcher error: {err}")

    def allocate_token_bitmask(self) -> torch.Tensor:
        """Allocate a token bitmask tensor for grammar constraints."""
        return llguidance.torch.allocate_token_bitmask(self.batch_size, self.vocab_size)

    def fill_token_bitmask(self, token_bitmask: torch.Tensor, idx: int) -> None:
        """
        Fill the token bitmask with the allowed tokens for the given batch index.

        If the matcher has stopped, this automatically yields an EOS-only mask.
        """
        llguidance.torch.fill_next_token_bitmask(self.matcher, token_bitmask, idx)
        self._check_error()

    def reset(self) -> None:
        """Reset the grammar matcher state to its initial conditions."""
        self.matcher.reset()
        self.is_terminated = False
        self._printed_error = False
        self._check_error()

    def accept_token(self, token: int) -> bool:
        """
        Validate and accept a generated token against the grammar constraints.

        Returns True if the token is accepted, False otherwise.
        """
        if self.is_terminated:
            return False
        if self.ll_tokenizer.eos_token == token:
            self.is_terminated = True
            return True
        result = self.matcher.consume_tokens([token])
        self._check_error()
        return result
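
# Illustrative sketch (not part of the module): how a sampling loop might drive
# this processor. Names like `logits` and `sampled_token_id` are hypothetical,
# and `apply_token_bitmask_inplace` is assumed to be available in llguidance.torch.
#
#     proc = LLGuidanceProcessor(ll_matcher, ll_tokenizer, grammar, vocab_size, batch_size=1)
#     mask = proc.allocate_token_bitmask()     # int32, shape (batch_size, ceil(vocab_size / 32))
#     for _ in range(max_new_tokens):
#         proc.fill_token_bitmask(mask, idx=0)  # compute allowed tokens for row 0
#         llguidance.torch.apply_token_bitmask_inplace(logits, mask)  # -inf disallowed tokens
#         sampled_token_id = int(torch.argmax(logits[0]))
#         if not proc.accept_token(sampled_token_id) or proc.is_terminated:
#             break
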
class LLGuidanceBackend(BackendBase):
    """
    LLGuidance-specific implementation of BackendBase.

    This backend handles the compilation of the various schema types
    (JSON, regex, etc.) into LLGuidance processors.
    """

    def __init__(self, fd_config: FDConfig, **kwargs):
        super().__init__(fd_config=fd_config)
        self.vocab_size = fd_config.model_config.vocab_size
        self.batch_size = fd_config.scheduler_config.max_num_seqs
        self.any_whitespace = not fd_config.structured_outputs_config.disable_any_whitespace
        llm_logger.info(f"LLGuidanceBackend vocab_size={self.vocab_size} batch_size={self.batch_size}")
        try:
            self.ll_tokenizer = llguidance.hf.from_tokenizer(self.hf_tokenizer, self.vocab_size)
        except Exception as e:
            raise RuntimeError(
                f"Failed to initialize llguidance tokenizer from HuggingFace tokenizer: {e} {traceback.format_exc()}"
            ) from e

    def _create_processor(
        self,
        compiled_grammar: str,
        enable_thinking: bool = False,
    ) -> Optional[LLGuidanceProcessor]:
        """Create a logits processor instance for the given compiled grammar."""
        try:
            ll_matcher = llguidance.LLMatcher(
                self.ll_tokenizer,
                compiled_grammar,
                log_level=FD_LLGUIDANCE_LOG_LEVEL,
            )
            return LLGuidanceProcessor(
                ll_matcher=ll_matcher,
                ll_tokenizer=self.ll_tokenizer,
                serialized_grammar=compiled_grammar,
                vocab_size=self.vocab_size,
                batch_size=self.batch_size,
                enable_thinking=enable_thinking,
            )
        except Exception as e:
            llm_logger.error(f"Failed to create llguidance processor: {e}, {traceback.format_exc()}")
            return None
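
    # Illustrative sketch (not part of the module): compiling a JSON schema into
    # an llguidance grammar and wrapping it in a processor. `backend` stands for
    # a hypothetical LLGuidanceBackend instance.
    #
    #     grammar = llguidance.LLMatcher.grammar_from_json_schema(
    #         '{"type": "object", "properties": {"name": {"type": "string"}}}',
    #         defaults={"whitespace_flexible": True},
    #     )
    #     proc = backend._create_processor(grammar)
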
    # All schema types share one creation path: by the time a request reaches the
    # backend, LLGuidanceChecker.schema_format has already serialized its spec
    # into an llguidance grammar string.
    def _json_processor(self, compiled_grammar: str, enable_thinking: bool = False) -> Optional[LLGuidanceProcessor]:
        return self._create_processor(compiled_grammar, enable_thinking)

    def _regex_processor(self, compiled_grammar: str, enable_thinking: bool = False) -> Optional[LLGuidanceProcessor]:
        return self._create_processor(compiled_grammar, enable_thinking)

    def _grammar_processor(
        self, compiled_grammar: str, enable_thinking: bool = False
    ) -> Optional[LLGuidanceProcessor]:
        return self._create_processor(compiled_grammar, enable_thinking)

    def _structural_tag_processor(
        self, compiled_grammar: str, enable_thinking: bool = False
    ) -> Optional[LLGuidanceProcessor]:
        return self._create_processor(compiled_grammar, enable_thinking)
def _walk_json_for_additional_properties(data: object):
    """Recursively default `additionalProperties` to false on every schema object
    that declares `properties` or `patternProperties` without setting it."""
    if isinstance(data, dict):
        for value in data.values():
            _walk_json_for_additional_properties(value)
        if "additionalProperties" not in data and ("properties" in data or "patternProperties" in data):
            data["additionalProperties"] = False
    elif isinstance(data, list):
        for item in data:
            _walk_json_for_additional_properties(item)


def process_for_additional_properties(guide_json: Union[str, dict[str, Any]]) -> dict[str, Any]:
    """Return a copy of the schema with additional properties disabled throughout."""
    if isinstance(guide_json, str):
        guide_json_obj = json.loads(guide_json)
    else:
        # Deep-copy so the caller's schema is not mutated in place.
        guide_json_obj = copy.deepcopy(guide_json)
    _walk_json_for_additional_properties(guide_json_obj)
    return guide_json_obj
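
# Illustrative sketch (not part of the module): the transformation performed by
# process_for_additional_properties on a nested schema.
#
#     schema = {
#         "type": "object",
#         "properties": {
#             "user": {"type": "object", "properties": {"name": {"type": "string"}}},
#         },
#     }
#     strict = process_for_additional_properties(schema)
#     # strict["additionalProperties"] == False
#     # strict["properties"]["user"]["additionalProperties"] == False
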
class LLGuidanceChecker(BaseChecker):
    """
    LLGuidance-specific implementation of BaseChecker.

    This checker validates the various schema types for compatibility with the
    llguidance library before processing.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Although the backend handles serialization, we can perform a quick
        # static check here without a full tokenizer.
        self.any_whitespace = not kwargs.get("disable_any_whitespace", False)
        self.disable_additional_properties = FD_GUIDANCE_DISABLE_ADDITIONAL
        """If `True`, the `guidance` backend disallows additional properties in
        generated JSON objects by forcing `additionalProperties: false` wherever
        the schema leaves it unset. This is only supported for the `guidance`
        backend and is used to better align its behaviour with `outlines` and
        `xgrammar`."""

    def serialize_guidance_grammar(self, request: Request):
        """Serialize the request's guided-decoding spec into an llguidance grammar string."""

        def _process_schema(
            grammar_spec: Union[str, dict[str, Any]],
        ) -> str:
            if self.disable_additional_properties:
                grammar_spec = process_for_additional_properties(grammar_spec)
            return llguidance.LLMatcher.grammar_from_json_schema(
                grammar_spec,
                defaults={
                    "whitespace_flexible": self.any_whitespace,
                },
            )

        if request.guided_json:
            if isinstance(request.guided_json, dict):
                guided_json = json.dumps(request.guided_json)
            else:
                guided_json = request.guided_json
            return _process_schema(guided_json)
        elif request.guided_json_object:
            # Accept any JSON object when only "json_object" mode is requested.
            return llguidance.LLMatcher.grammar_from_json_schema(
                '{"type": "object"}',
                defaults={
                    "whitespace_flexible": self.any_whitespace,
                },
            )

        if request.structural_tag:
            if isinstance(request.structural_tag, str):
                s_tag = json.loads(request.structural_tag)
            else:
                s_tag = request.structural_tag
            triggers: list[str] = s_tag["triggers"]
            tags: list[llguidance.StructTag] = []
            for s in s_tag["structures"]:
                begin: str = s["begin"]
                trig = next((t for t in triggers if begin.startswith(t)), None)
                if trig is None:
                    raise ValueError(f"Trigger {begin} not found in triggers {triggers}")
                tags.append(
                    llguidance.StructTag(
                        trigger=trig,
                        begin=s["begin"],
                        grammar=_process_schema(s["schema"]),
                        end=s["end"],
                    )
                )
            if not tags:
                raise ValueError("No structural tags found in the grammar spec.")
            return llguidance.StructTag.to_grammar(tags)
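
        # Illustrative sketch (not part of the module): the shape of a
        # structural_tag spec this branch consumes. Every "begin" string must
        # start with one of the listed triggers.
        #
        #     {
        #         "triggers": ["<function="],
        #         "structures": [
        #             {
        #                 "begin": "<function=get_weather>",
        #                 "schema": {"type": "object", "properties": {"city": {"type": "string"}}},
        #                 "end": "</function>"
        #             }
        #         ]
        #     }
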
        if request.guided_regex:
            tp = "regex"
            grammar_spec = request.guided_regex
        elif request.guided_choice:
            tp = "choice"
            grammar_spec = request.guided_choice
        elif request.guided_grammar:
            tp = "grammar"
            grammar_spec = request.guided_grammar
        else:
            llm_logger.error("Validation should have already occurred. Please file an issue.")
            raise ValueError("Grammar is not one of the supported types.")
        return llguidance.grammar_from(tp, grammar_spec)
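
    # Illustrative sketch (not part of the module): llguidance.grammar_from turns
    # a (type, spec) pair into a serialized grammar, e.g.
    #
    #     llguidance.grammar_from("regex", r"\d{4}-\d{2}-\d{2}")   # ISO-style dates
    #     llguidance.grammar_from("choice", ["yes", "no", "maybe"])
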
    def schema_format(self, request: Request) -> Tuple[Request, Optional[str]]:
        """
        Validate and format the schema for the LLGuidance backend.
        """
        try:
            guidance_grm = self.serialize_guidance_grammar(request)
            err = llguidance.LLMatcher.validate_grammar(guidance_grm, None)
            if err:
                raise ValueError(f"Grammar error: {err}")
            llm_logger.info(f"Valid schema format: {guidance_grm} request={request}")
            if request.guided_regex:
                request.guided_regex = guidance_grm
            elif request.guided_choice:
                # A choice spec is rewritten as a grammar; clear the original field
                # so downstream dispatch picks the grammar processor.
                request.guided_grammar = guidance_grm
                request.guided_choice = None
            elif request.guided_grammar:
                request.guided_grammar = guidance_grm
            elif request.guided_json:
                request.guided_json = guidance_grm
        except (ValueError, TypeError, json.JSONDecodeError) as e:
            err_msg = f"Invalid format for guided decoding: {e!s} request={request}"
            return request, err_msg
        except Exception as e:
            err_msg = f"An unexpected error occurred during schema validation: {e!s}"
            return request, err_msg
        return request, None
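
# Illustrative sketch (not part of the module): the end-to-end flow, assuming a
# hypothetical `request` carrying a guided_json schema and an already-constructed
# `backend` (LLGuidanceBackend).
#
#     checker = LLGuidanceChecker()
#     request, err = checker.schema_format(request)   # serialize + validate
#     if err is None:
#         proc = backend._json_processor(request.guided_json)
#         # proc now masks logits so the model can only emit schema-valid JSON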