// Copyright (c) 2023 Chen Qianhe Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include "fastdeploy/fastdeploy_model.h" #include "fastdeploy/runtime.h" #include "./wav.h" class Vad : public fastdeploy::FastDeployModel { public: std::string ModelName() const override; Vad(const std::string& model_file, const fastdeploy::RuntimeOption& custom_option = fastdeploy::RuntimeOption()) { valid_cpu_backends = {fastdeploy::Backend::ORT, fastdeploy::Backend::OPENVINO}; valid_gpu_backends = {fastdeploy::Backend::ORT, fastdeploy::Backend::TRT}; runtime_option = custom_option; runtime_option.model_format = fastdeploy::ModelFormat::ONNX; runtime_option.model_file = model_file; runtime_option.params_file = ""; } void init() { initialized = Initialize(); } void setAudioCofig(int sr, int frame_ms, float threshold, int min_silence_duration_ms, int speech_pad_ms); void loadAudio(const std::string& wavPath); bool Predict(); std::vector> getResult(float removeThreshold = 1.6, float expandHeadThreshold = 0.32, float expandTailThreshold = 0, float mergeThreshold = 0.3); private: bool Initialize(); bool Preprocess(std::vector& audioWindowData); bool Postprocess(); private: // model std::vector inputTensors_; std::vector outputTensors_; // model states bool triggerd_ = false; unsigned int speech_start_ = 0; unsigned int speech_end_ = 0; unsigned int temp_end_ = 0; unsigned int current_sample_ = 0; // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes float outputProb_; /* ======================================================================== */ // input wav data wav::WavReader wavReader_; std::vector data_; std::vector inputWav_; /* ======================================================================== */ // audio config int sample_rate_ = 16000; int frame_ms_ = 64; float threshold_ = 0.5f; int min_silence_duration_ms_ = 0; int speech_pad_ms_ = 0; int64_t window_size_samples_; // Assign when init, support 256 512 768 for 8k; 512 1024 1536 for 16k. int sr_per_ms_; // Assign when init, support 8 or 16 int min_silence_samples_; // sr_per_ms_ * #ms int speech_pad_samples_; // usually a /* ======================================================================== */ std::vector input_; std::vector sr_; unsigned int size_hc_ = 2 * 1 * 64; // It's FIXED. std::vector _h; std::vector _c; std::vector input_node_dims_; const std::vector sr_node_dims_ = {1}; const std::vector hc_node_dims_ = {2, 1, 64}; /* ======================================================================== */ std::vector speakStart_; std::vector speakEnd_; public: int getSampleRate() const; int getFrameMs() const; float getThreshold() const; int getMinSilenceDurationMs() const; int getSpeechPadMs() const; const wav::WavReader& getWavReader() const; const std::vector& getData() const; const std::vector& getInputWav() const; int64_t getWindowSizeSamples() const; int getSrPerMs() const; int getMinSilenceSamples() const; int getSpeechPadSamples() const; };