""" # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ import os import weakref import pytest from fastdeploy.entrypoints.llm import LLM bash_path = os.getenv("MODEL_PATH") FD_ENGINE_QUEUE_PORTS = [ [9961, 9962, 9963, 9964, 9965, 9966, 9967, 9968], [9971, 9972, 9973, 9974, 9975, 9976, 9977, 9978], [9981, 9982, 9983, 9984, 9985, 9986, 9987, 9988], [9991, 9992, 9993, 9994, 9995, 9996, 9997, 9998], ] models = [ "ernie-4_5-fake-w4a8-unpermuted", "ernie-4_5-fake-w4a8-permuted", "ernie-4_5-fake-w4afp8-unpermuted", "ernie-4_5-fake-w4afp8-permuted", ] prompts = ["解释下“温故而知新"] @pytest.fixture(scope="module", params=models) def llm(request): """LLM测试夹具""" model_path = os.path.join(bash_path, request.param) try: port_index = models.index(request.param) % len(FD_ENGINE_QUEUE_PORTS) llm_instance = LLM( model=model_path, tensor_parallel_size=1, data_parallel_size=8, max_model_len=8192, num_gpu_blocks_override=1024, engine_worker_queue_port=FD_ENGINE_QUEUE_PORTS[port_index], load_choices="default", enable_expert_parallel=True, ) yield weakref.proxy(llm_instance) except Exception as e: pytest.skip(f"LLM initialization failed: {e}") @pytest.mark.timeout(60) def test_generation(llm): print(f"testing generation with model: {llm}") # topp_params = SamplingParams(temperature=0.1, top_p=0, max_tokens=20) # output = llm.generate(prompts=prompts, sampling_params=topp_params) # print(output)