Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-31 11:56:44 +08:00)

Commit f2c09a87a6

* add tts example
* update example
* update use fd engine
* add tts python example
* add readme
* fix comment
* change paddle model
* fix readme style

Co-authored-by: Jason <jiangjiajun@baidu.com>

215 lines · 7.4 KiB · Python

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
import time

import fastdeploy as fd
import numpy as np
import soundfile as sf

from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
from paddlespeech.t2s.frontend.zh_frontend import Frontend

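# fastdeploy supplies the Runtime objects used for inference below, paddlespeech
# supplies the Chinese text frontend plus the denorm/get_chunks helpers, and
# soundfile is used to write the synthesized waveform to disk.
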
model_name_fastspeech2 = "fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0"
model_zip_fastspeech2 = model_name_fastspeech2 + ".zip"
model_url_fastspeech2 = "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/" + model_zip_fastspeech2
model_name_mb_melgan = "mb_melgan_csmsc_static_0.1.1"
model_zip_mb_melgan = model_name_mb_melgan + ".zip"
model_url_mb_melgan = "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/" + model_zip_mb_melgan

dir_name = os.path.dirname(os.path.realpath(__file__)) + "/"

if not os.path.exists(model_name_fastspeech2):
    if os.path.exists(model_zip_fastspeech2):
        os.remove(model_zip_fastspeech2)
    fd.download_and_decompress(model_url_fastspeech2, path=dir_name)
    os.remove(model_zip_fastspeech2)
if not os.path.exists(model_name_mb_melgan):
    if os.path.exists(model_zip_mb_melgan):
        os.remove(model_zip_mb_melgan)
    fd.download_and_decompress(model_url_mb_melgan, path=dir_name)
    os.remove(model_zip_mb_melgan)

voc_block = 36
voc_pad = 14
am_block = 72
am_pad = 12
voc_upsample = 300

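# Note on the chunk parameters above: the *_block values are the number of mel
# frames kept from each streaming chunk, the *_pad values are the extra overlap
# frames inferred on each side and removed again by depadding(), and
# voc_upsample is the number of waveform samples the vocoder produces per mel
# frame (assumed here to match the 300-sample hop of the 24 kHz mb_melgan_csmsc
# model).
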
# Model paths

phones_dict = dir_name + model_name_fastspeech2 + "/phone_id_map.txt"
am_stat_path = dir_name + model_name_fastspeech2 + "/speech_stats.npy"

am_encoder_model = dir_name + model_name_fastspeech2 + "/fastspeech2_csmsc_am_encoder_infer.pdmodel"
am_decoder_model = dir_name + model_name_fastspeech2 + "/fastspeech2_csmsc_am_decoder.pdmodel"
am_postnet_model = dir_name + model_name_fastspeech2 + "/fastspeech2_csmsc_am_postnet.pdmodel"
voc_melgan_model = dir_name + model_name_mb_melgan + "/mb_melgan_csmsc.pdmodel"

am_encoder_para = dir_name + model_name_fastspeech2 + "/fastspeech2_csmsc_am_encoder_infer.pdiparams"
am_decoder_para = dir_name + model_name_fastspeech2 + "/fastspeech2_csmsc_am_decoder.pdiparams"
am_postnet_para = dir_name + model_name_fastspeech2 + "/fastspeech2_csmsc_am_postnet.pdiparams"
voc_melgan_para = dir_name + model_name_mb_melgan + "/mb_melgan_csmsc.pdiparams"

frontend = Frontend(phone_vocab_path=phones_dict, tone_vocab_path=None)
am_mu, am_std = np.load(am_stat_path)

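# Frontend turns Chinese text into phone ids via phone_id_map.txt, and
# speech_stats.npy provides the mel mean/std that denorm() uses later to
# rescale the acoustic model's normalized output.
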
option_1 = fd.RuntimeOption()
option_1.set_model_path(am_encoder_model, am_encoder_para)
option_1.use_cpu()
option_1.use_ort_backend()
option_1.set_cpu_thread_num(12)
am_encoder_runtime = fd.Runtime(option_1)

option_2 = fd.RuntimeOption()
option_2.set_model_path(am_decoder_model, am_decoder_para)
option_2.use_cpu()
option_2.use_ort_backend()
option_2.set_cpu_thread_num(12)
am_decoder_runtime = fd.Runtime(option_2)

option_3 = fd.RuntimeOption()
option_3.set_model_path(am_postnet_model, am_postnet_para)
option_3.use_cpu()
option_3.use_ort_backend()
option_3.set_cpu_thread_num(12)
am_postnet_runtime = fd.Runtime(option_3)

option_4 = fd.RuntimeOption()
option_4.set_model_path(voc_melgan_model, voc_melgan_para)
option_4.use_cpu()
option_4.use_ort_backend()
option_4.set_cpu_thread_num(12)
voc_melgan_runtime = fd.Runtime(option_4)


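# Each sub-model above gets its own FastDeploy Runtime on CPU with the ONNX
# Runtime backend and 12 threads. The four blocks could also be built with a
# small helper, e.g. (a sketch, not part of the original example; it only uses
# the RuntimeOption calls already shown above):
#
#   def build_runtime(model_file, params_file, num_threads=12):
#       opt = fd.RuntimeOption()
#       opt.set_model_path(model_file, params_file)
#       opt.use_cpu()
#       opt.use_ort_backend()
#       opt.set_cpu_thread_num(num_threads)
#       return fd.Runtime(opt)
#
#   am_encoder_runtime = build_runtime(am_encoder_model, am_encoder_para)
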
def depadding(data, chunk_num, chunk_id, block, pad, upsample):
    """
    Remove the padded (overlap) part of one chunk of streaming inference
    output, keeping only the frames that belong to this chunk.
    """
    front_pad = min(chunk_id * block, pad)
    # first chunk
    if chunk_id == 0:
        data = data[:block * upsample]
    # last chunk
    elif chunk_id == chunk_num - 1:
        data = data[front_pad * upsample:]
    # middle chunk
    else:
        data = data[front_pad * upsample:(front_pad + block) * upsample]

    return data


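# Illustration of depadding() with made-up numbers: for block=2, pad=1,
# upsample=1 and chunk_num=3, chunk 0 keeps data[:2], chunk 1 keeps data[1:3],
# and chunk 2 keeps data[1:].
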
def inference_stream(text):
    input_ids = frontend.get_input_ids(
        text, merge_sentences=False, get_tone_ids=False)
    phone_ids = input_ids["phone_ids"]
    for i in range(len(phone_ids)):
        part_phone_ids = phone_ids[i].numpy()
        voc_chunk_id = 0
        orig_hs = am_encoder_runtime.infer({
            'text':
            part_phone_ids.astype("int64")
        })
        orig_hs = orig_hs[0]

        # streaming voc chunk info
        mel_len = orig_hs.shape[1]
        voc_chunk_num = math.ceil(mel_len / voc_block)
        start = 0
        end = min(voc_block + voc_pad, mel_len)

        # streaming am
        hss = get_chunks(orig_hs, am_block, am_pad, "am")
        am_chunk_num = len(hss)
        for am_chunk_id, hs in enumerate(hss):
            am_decoder_output = am_decoder_runtime.infer({
                'xs':
                hs.astype("float32")
            })

            am_postnet_output = am_postnet_runtime.infer({
                'xs':
                np.transpose(am_decoder_output[0], (0, 2, 1))
            })
            am_output_data = am_decoder_output + np.transpose(
                am_postnet_output[0], (0, 2, 1))
            normalized_mel = am_output_data[0][0]

            sub_mel = denorm(normalized_mel, am_mu, am_std)
            sub_mel = depadding(sub_mel, am_chunk_num, am_chunk_id, am_block,
                                am_pad, 1)

            if am_chunk_id == 0:
                mel_streaming = sub_mel
            else:
                mel_streaming = np.concatenate((mel_streaming, sub_mel), axis=0)

            # streaming voc
            # Once the streaming AM has produced enough mel frames for the
            # current vocoder chunk, run streaming vocoder inference.
            while (mel_streaming.shape[0] >= end and
                   voc_chunk_id < voc_chunk_num):
                voc_chunk = mel_streaming[start:end, :]

                sub_wav = voc_melgan_runtime.infer({
                    'logmel':
                    voc_chunk.astype("float32")
                })
                sub_wav = depadding(sub_wav[0], voc_chunk_num, voc_chunk_id,
                                    voc_block, voc_pad, voc_upsample)

                yield sub_wav

                voc_chunk_id += 1
                start = max(0, voc_chunk_id * voc_block - voc_pad)
                end = min((voc_chunk_id + 1) * voc_block + voc_pad, mel_len)


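# Each chunk yielded by inference_stream() is a float32 waveform segment that
# the caller flattens and concatenates; at 24 kHz a full chunk is roughly
# voc_block * voc_upsample samples long.
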
if __name__ == '__main__':
    text = "欢迎使用飞桨语音合成系统,测试一下合成效果。"
    # warm up
    # The first onnxruntime inference takes noticeably longer, so a warm-up
    # pass beforehand is recommended.
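    # A minimal warm-up sketch (not part of the original example; the short
    # sentence below is an arbitrary choice): synthesize one throwaway
    # utterance so onnxruntime initialization is not counted in the timings.
    # for _ in inference_stream("测试。"):
    #     pass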
    '''
    # Playback with pyaudio
    p = pyaudio.PyAudio()
    stream = p.open(
        format=p.get_format_from_width(2),  # int16
        channels=1,
        rate=24000,
        output=True)
    '''
    # Timing
    wavs = []
    t1 = time.time()
    for sub_wav in inference_stream(text):
        print("Response time:", time.time() - t1)
        t1 = time.time()
        wavs.append(sub_wav.flatten())
        # float32 to int16
        #wav = float2pcm(sub_wav)
        # to bytes
        #wav_bytes = wav.tobytes()
        #stream.write(wav_bytes)

    # Close the pyaudio player
    #stream.stop_stream()
    #stream.close()
    #p.terminate()

    # Export the streaming synthesis result
    wav = np.concatenate(wavs)
    sf.write("demo_stream.wav", data=wav, samplerate=24000)
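
# Running this example requires fastdeploy, paddlespeech, numpy and soundfile
# (see the imports at the top); pyaudio is only needed if the commented-out
# live playback code is enabled.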