# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Unit test for the eagle_get_self_hidden_states GPU op against a Python reference implementation."""

import unittest

import numpy as np
import paddle

from fastdeploy.model_executor.ops.gpu import eagle_get_self_hidden_states


def computeOrderKernel(last_seq_lens_this_time, seq_lens_this_time, step_idx, src_map, output_token_num, bsz):
    """Reference of the ordering step: fills `src_map` with the source-token index
    for each output token and writes the total number of output tokens."""
    in_offset = 0
    out_offset = 0
    for i in range(bsz):
        cur_seq_lens_this_time = seq_lens_this_time[i]
        cur_last_seq_lens_this_time = last_seq_lens_this_time[i]
        # 1. encoder: this request contributed one input token; keep it
        if step_idx[i] == 1 and cur_seq_lens_this_time > 0:
            in_offset += 1
            src_map[out_offset] = in_offset - 1
            out_offset += 1
        # 2. decoder (seq_lens_this_time == 1): skip the last step's tokens, keep the final one
        elif cur_seq_lens_this_time > 0:
            in_offset += cur_last_seq_lens_this_time
            src_map[out_offset] = in_offset - 1
            out_offset += 1
        # 3. stop: produce no output token, only advance the input offset
        else:
            # stopped right after the first token
            if step_idx[i] == 1:
                in_offset += 1 if cur_last_seq_lens_this_time > 0 else 0
            # stopped during normal decoding
            else:
                in_offset += cur_last_seq_lens_this_time
    output_token_num[0] = out_offset


def rebuildSelfHiddenStatesKernel(input, src_map, output, dim_embed, elem_cnt):
    """Gathers rows of `input` into `output` according to `src_map`."""
    for elem_id in range(elem_cnt):
        output_token_idx = elem_id // dim_embed
        input_token_idx = src_map[output_token_idx]
        offset = elem_id % dim_embed
        output[output_token_idx][offset] = input[input_token_idx][offset]


def eagle_get_self_hidden_states_ref(input, last_seq_lens_this_time, seq_lens_this_time, step_idx):
    """Pure Paddle/Python reference implementation of eagle_get_self_hidden_states."""
    input_token_num = input.shape[0]
    dim_embed = input.shape[1]
    bsz = seq_lens_this_time.shape[0]
    src_map = paddle.full([input_token_num], -1, seq_lens_this_time.dtype)
    output_token_num = paddle.full([1], 0, seq_lens_this_time.dtype)
    computeOrderKernel(last_seq_lens_this_time, seq_lens_this_time, step_idx, src_map, output_token_num, bsz)

    output_token_num_cpu = output_token_num[0]
    out = paddle.full([output_token_num_cpu, dim_embed], -1, input.dtype)
    elem_cnt = output_token_num_cpu * dim_embed
    rebuildSelfHiddenStatesKernel(input, src_map, out, dim_embed, elem_cnt)
    return out


class TestEagleGetSelfHiddenStates(unittest.TestCase):
    def test_eagle_get_self_hidden_states(self):
        paddle.seed(2023)
        np.random.seed(2023)

        bs = 2
        input_token_num = 10
        dim_embed = 512
        last_seq_lens_this_time = np.random.randint(0, input_token_num // bs, bs, dtype=np.int32)
        seq_lens_this_time = np.random.randint(0, input_token_num // bs, bs, dtype=np.int32)
        step_idx = np.arange(0, bs, dtype=np.int32)
        last_seq_lens_this_time_tensor = paddle.to_tensor(last_seq_lens_this_time, dtype=paddle.int32)
        seq_lens_this_time_tensor = paddle.to_tensor(seq_lens_this_time, dtype=paddle.int32)
        step_idx_tensor = paddle.to_tensor(step_idx, dtype=paddle.int64)

        input = np.random.randint(0, 10, (input_token_num, dim_embed), dtype=np.int32)
        input_tensor = paddle.to_tensor(input, dtype=paddle.float16)

        out = eagle_get_self_hidden_states(
            input_tensor,
            last_seq_lens_this_time_tensor,
            seq_lens_this_time_tensor,
            step_idx_tensor,
        )

        out_ref = eagle_get_self_hidden_states_ref(
            input_tensor,
            last_seq_lens_this_time_tensor,
            seq_lens_this_time_tensor,
            step_idx_tensor,
        )

        np.testing.assert_allclose(out.numpy(), out_ref.numpy())
if __name__ == "__main__":
    unittest.main()