这是一次长久的提交:
1. 应用界面增加了返回主页的按钮 2. 修复了gif渲染内存泄漏的严重bug 3. 将PetDao当中的cJSON API替换为cpp_json,完美通过测试 4. 整合已经实现的各种上层建筑,实现了一个宠物对话基本业务应用,用于样品测试展示用 5. 重构了音频播放类,使其更modern,更加便于移植和拓展
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
//
|
||||
// Created by misaki on 2025/9/29.
|
||||
//
|
||||
|
||||
#include "SimpleI2SForwarder.h"
|
||||
|
||||
// Out-of-class definitions of LatestDataForwarder's static members; the instance pointer starts out null.
|
||||
LatestDataForwarder* LatestDataForwarder::instance = nullptr;
|
||||
std::mutex LatestDataForwarder::instance_mutex;
|
||||
@@ -0,0 +1,124 @@
|
||||
//
|
||||
// Created by misaki on 2025/9/29.
|
||||
//
|
||||
/**
|
||||
* 音频数据转发器 单生产者单消费者模型
|
||||
* 单纯只作为数据转发器,不进行任何处理
|
||||
*/
|
||||
#pragma once
|
||||
#include <atomic>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <deque>
|
||||
|
||||
/**
 * Thread-safe forwarder for raw 16-bit PCM chunks, exposed as a singleton.
 *
 * A producer pushes chunks with injectData(); a consumer drains them with
 * retrieveLatestData() or retrieveRecentData(). The queue is bounded: when it
 * already holds max_size chunks the oldest chunk is dropped to make room, so
 * the consumer always sees the most recent data. Samples are forwarded
 * untouched.
 *
 * The forwarder is created in the stopped state; call initialize() or start()
 * first, otherwise injectData() is a no-op and the retrieve functions return
 * false.
 */
class LatestDataForwarder {
private:
    // Singleton storage; both members are defined out of class in a source file.
    static LatestDataForwarder* instance;
    static std::mutex instance_mutex;

    std::deque<std::vector<int16_t>> queue_;   // oldest chunk at the front
    size_t max_size_;                          // capacity in chunks, kept >= 1
    std::mutex queue_mutex_;                   // guards queue_ and max_size_
    std::condition_variable data_available_;   // signalled on new data and on stop()
    std::atomic<bool> is_running_;

    LatestDataForwarder() : max_size_(500), is_running_(false) {}

    /**
     * Waits (with `lock` held on queue_mutex_) until a chunk is queued, the
     * forwarder is stopped, or the timeout expires.
     *
     * A predicate is used so that spurious wake-ups do not end the wait early.
     *
     * @param lock       Lock on queue_mutex_, owned by the caller.
     * @param timeout_ms Maximum time to wait; <= 0 means "do not wait".
     * @return true when the forwarder is running and the queue is non-empty.
     */
    bool waitForData(std::unique_lock<std::mutex>& lock, const int timeout_ms) {
        if (queue_.empty()) {
            if (timeout_ms <= 0) return false;
            data_available_.wait_for(lock, std::chrono::milliseconds(timeout_ms),
                                     [this] { return !queue_.empty() || !is_running_; });
        }
        return is_running_ && !queue_.empty();
    }

public:
    // A singleton must not be copied.
    LatestDataForwarder(const LatestDataForwarder&) = delete;
    LatestDataForwarder& operator=(const LatestDataForwarder&) = delete;

    /** Returns the process-wide instance, creating it on first use. */
    static LatestDataForwarder* getInstance() {
        std::lock_guard<std::mutex> lock(instance_mutex);
        if (!instance) {
            instance = new LatestDataForwarder();
        }
        return instance;
    }

    /**
     * Sets the queue capacity and puts the forwarder into the running state.
     *
     * @param max_size Maximum number of queued chunks. A value of 0 is treated
     *                 as 1, because a zero capacity would otherwise make
     *                 injectData() pop from an empty queue.
     */
    void initialize(const size_t max_size = 500) {
        std::lock_guard<std::mutex> lock(queue_mutex_);
        max_size_ = max_size > 0 ? max_size : 1;
        is_running_ = true;
    }

    /**
     * Producer side: appends a copy of `data` as the newest chunk, dropping the
     * oldest chunk(s) when the queue is full.
     *
     * Ignored when the forwarder is stopped, `data` is null or `length` is 0.
     *
     * @param data   Pointer to `length` samples.
     * @param length Number of samples.
     */
    void injectData(const int16_t* data, const size_t length) {
        if (!is_running_ || !data || length == 0) return;

        // Copy the samples before taking the lock to keep the critical section short.
        std::vector<int16_t> chunk(data, data + length);
        {
            std::lock_guard<std::mutex> lock(queue_mutex_);
            // The emptiness check keeps pop_front() well-defined for any capacity.
            while (!queue_.empty() && queue_.size() >= max_size_) {
                queue_.pop_front();
            }
            queue_.push_back(std::move(chunk));
        }
        data_available_.notify_one();
    }

    /**
     * Consumer side: moves everything currently queued into `output`
     * (oldest sample first) and empties the queue.
     *
     * @param output     Receives the concatenated samples; only modified on success.
     * @param timeout_ms How long to wait for data when the queue is empty;
     *                   <= 0 returns immediately.
     * @return true if data was written to `output`; false when stopped or when
     *         no data became available in time.
     */
    bool retrieveLatestData(std::vector<int16_t>& output, const int timeout_ms = 0) {
        if (!is_running_) return false;
        std::unique_lock<std::mutex> lock(queue_mutex_);
        if (!waitForData(lock, timeout_ms)) return false;

        size_t total_samples = 0;
        for (const auto& chunk : queue_) {
            total_samples += chunk.size();
        }
        output.clear();
        output.reserve(total_samples);
        for (const auto& chunk : queue_) {
            output.insert(output.end(), chunk.begin(), chunk.end());
        }
        queue_.clear();
        return true;
    }

    /**
     * Consumer side: copies the newest `recent_frames` chunks into `output`
     * and discards every older chunk.
     *
     * The copied chunks stay in the queue, so a later call can return them
     * again. NOTE(review): confirm that keeping them queued is intended.
     *
     * @param output        Receives the concatenated samples; only modified on success.
     * @param recent_frames Number of newest chunks to return.
     * @param timeout_ms    How long to wait for data when the queue is empty;
     *                      <= 0 returns immediately.
     * @return true if the queue held data; false when stopped or when no data
     *         became available in time.
     */
    bool retrieveRecentData(std::vector<int16_t>& output, const size_t recent_frames = 10, const int timeout_ms = 0) {
        if (!is_running_) return false;
        std::unique_lock<std::mutex> lock(queue_mutex_);
        if (!waitForData(lock, timeout_ms)) return false;

        // Drop everything older than the requested tail, in place.
        const size_t start_index = queue_.size() > recent_frames ? queue_.size() - recent_frames : 0;
        queue_.erase(queue_.begin(), queue_.begin() + start_index);

        output.clear();
        for (const auto& chunk : queue_) {
            output.insert(output.end(), chunk.begin(), chunk.end());
        }
        return true;
    }

    /** Returns the number of chunks currently queued. */
    size_t getQueueSize() {
        std::lock_guard<std::mutex> lock(queue_mutex_);
        return queue_.size();
    }

    /** Discards all queued chunks. */
    void clear() {
        std::lock_guard<std::mutex> lock(queue_mutex_);
        queue_.clear();
    }

    /**
     * Stops the forwarder and wakes every waiting consumer.
     *
     * The flag is changed under the queue mutex so a consumer that has just
     * evaluated its wait condition cannot miss the notification.
     */
    void stop() {
        {
            std::lock_guard<std::mutex> lock(queue_mutex_);
            is_running_ = false;
        }
        data_available_.notify_all();
    }

    /** Puts the forwarder back into the running state without changing the capacity. */
    void start() {
        is_running_ = true;
    }
};
|
||||
@@ -2,20 +2,22 @@
|
||||
// Created by misaki on 2025/9/15.
|
||||
//
|
||||
#include "SpeechRecognizer.h"
|
||||
#include "esp_afe_sr_models.h"
|
||||
#include "esp_mn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_mn_speech_commands.h"
|
||||
#include "model_path.h"
|
||||
#include "driver/gpio.h"
|
||||
#include "soc/soc_caps.h"
|
||||
#include "esp_err.h"
|
||||
#include "nvs_flash.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#include "VadSlidingWindow.h"
|
||||
#include "SimpleI2SForwarder.h"
|
||||
|
||||
#include <esp_afe_sr_models.h>
|
||||
#include <esp_mn_models.h>
|
||||
#include <esp_wn_iface.h>
|
||||
#include <esp_mn_speech_commands.h>
|
||||
#include <model_path.h>
|
||||
#include <driver/gpio.h>
|
||||
#include <soc/soc_caps.h>
|
||||
#include <esp_err.h>
|
||||
#include <nvs_flash.h>
|
||||
#include <freertos/FreeRTOS.h>
|
||||
#include <freertos/task.h>
|
||||
#include <atomic>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
// 初始化静态成员变量
|
||||
@@ -40,8 +42,8 @@ SpeechRecognizer::SpeechRecognizer()
|
||||
models(nullptr),
|
||||
multinet(nullptr),
|
||||
model_data(nullptr),
|
||||
tasksRunning(false) {
|
||||
}
|
||||
vad_state_(AFE_VAD_SILENCE),
|
||||
tasksRunning(false){}
|
||||
|
||||
SpeechRecognizer::~SpeechRecognizer() {
|
||||
deinit();
|
||||
@@ -68,6 +70,29 @@ bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SpeechRecognizer::init(const bool enable_vad, const vad_mode_t vad_mode, std::string model_path) {
|
||||
if (initialized) {
|
||||
ESP_LOGI("SpeechRecognizer", "Already initialized");
|
||||
return true;
|
||||
}
|
||||
this->config.enable_vad = enable_vad;
|
||||
this->config.vad_mode = vad_mode;
|
||||
this->config.model_path = std::move(model_path);
|
||||
// 初始化I2S
|
||||
if (!initI2S()) {
|
||||
ESP_LOGE("SpeechRecognizer", "I2S initialization failed");
|
||||
return false;
|
||||
}
|
||||
// 初始化ESP-SR
|
||||
if (!initESP_SR()) {
|
||||
ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed");
|
||||
return false;
|
||||
}
|
||||
initialized = true;
|
||||
ESP_LOGI("SpeechRecognizer", "Initialization completed successfully");
|
||||
return true;
|
||||
}
|
||||
|
||||
void SpeechRecognizer::deinit() {
|
||||
if (!initialized) {
|
||||
return;
|
||||
@@ -88,8 +113,8 @@ void SpeechRecognizer::deinit() {
|
||||
}
|
||||
// 释放I2S资源
|
||||
if (rx_handle) {
|
||||
i2s_channel_disable(rx_handle);
|
||||
i2s_del_channel(rx_handle);
|
||||
i2s_channel_disable(rx_handle); // 删除通道之前必须先禁用通道
|
||||
i2s_del_channel(rx_handle); // 删除该句柄以释放通道资源
|
||||
rx_handle = nullptr;
|
||||
}
|
||||
initialized = false;
|
||||
@@ -98,30 +123,33 @@ void SpeechRecognizer::deinit() {
|
||||
|
||||
bool SpeechRecognizer::initI2S() {
|
||||
esp_err_t ret = ESP_OK;
|
||||
i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
|
||||
// 通过辅助宏获取默认的通道配置, 它可以帮助指定 I2S 角色和端口 ID
|
||||
constexpr i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
|
||||
// 分配新的 TX 通道并获取该通道的句柄
|
||||
ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle);
|
||||
if (ret != ESP_OK) {
|
||||
ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret));
|
||||
return false;
|
||||
}
|
||||
// 进行配置,通过宏生成声道配置和时钟配置, 这两个辅助宏在 'i2s_std.h' 中定义,只能用于 STD 模式
|
||||
i2s_std_config_t std_cfg = {
|
||||
.clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
|
||||
.slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),
|
||||
.clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000), // 16KHz采样率
|
||||
.slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO), // 32位单声道
|
||||
.gpio_cfg = {
|
||||
.mclk = GPIO_NUM_NC,
|
||||
.bclk = config.bclk_pin,
|
||||
.ws = config.ws_pin,
|
||||
.dout = GPIO_NUM_NC,
|
||||
.din = config.din_pin,
|
||||
.invert_flags = {
|
||||
.mclk = GPIO_NUM_NC, // 不使用MCLK
|
||||
.bclk = config.bclk_pin, // BCLK引脚
|
||||
.ws = config.ws_pin, // WS引脚
|
||||
.dout = GPIO_NUM_NC, // 不使用DOUT
|
||||
.din = config.din_pin, // DIN引脚
|
||||
.invert_flags = { // 不使用倒置
|
||||
.mclk_inv = false,
|
||||
.bclk_inv = false,
|
||||
.ws_inv = false,
|
||||
},
|
||||
},
|
||||
};
|
||||
std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;
|
||||
ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);
|
||||
std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT; // 右声道
|
||||
ret = i2s_channel_init_std_mode(rx_handle, &std_cfg); // 初始化STD标准模式
|
||||
if (ret != ESP_OK) {
|
||||
ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret));
|
||||
return false;
|
||||
@@ -177,7 +205,7 @@ bool SpeechRecognizer::initESP_SR() {
|
||||
ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config");
|
||||
return false;
|
||||
}
|
||||
// 加载MultiNet模型(采用esp-sr提供的宏来处理不同语种的模型的处理问题)
|
||||
// 加载MultiNet模型 (采用esp-sr提供的宏来处理不同语种的模型的处理问题)
|
||||
#if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT)
|
||||
char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
|
||||
#else
|
||||
@@ -211,7 +239,7 @@ bool SpeechRecognizer::start() {
|
||||
return true;
|
||||
}
|
||||
// 启用I2S通道
|
||||
esp_err_t ret = i2s_channel_enable(rx_handle);
|
||||
esp_err_t ret = i2s_channel_enable(rx_handle); // 在读取数据之前,先启动 RX 通道
|
||||
if (ret != ESP_OK) {
|
||||
ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(ret));
|
||||
return false;
|
||||
@@ -253,29 +281,51 @@ void SpeechRecognizer::stop() {
|
||||
|
||||
void SpeechRecognizer::feedTask() {
|
||||
ThreadManager::printThreadInfo("Feed task started");
|
||||
int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
|
||||
int nch = afe_handle->get_channel_num(afe_data);
|
||||
size_t samp_len = audio_chunksize;
|
||||
size_t samp_len_bytes = samp_len * sizeof(int32_t); // 单声道32位
|
||||
auto *i2s_buff = static_cast<int32_t *>(malloc(samp_len_bytes));
|
||||
if (!i2s_buff) {
|
||||
ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer");
|
||||
const int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
|
||||
ESP_LOGW("SpeechRecognizer", "Feed task: audio_chunksize=%d", audio_chunksize);
|
||||
int nch = afe_handle->get_channel_num(afe_data); // 获取I2S通道的声道数, 此处为1, 因此并没有被下面所使用
|
||||
const size_t samp_len = audio_chunksize;
|
||||
const size_t samp_len_bytes = samp_len * sizeof(int32_t); // 单声道32位
|
||||
// 分配I2S缓冲区 放在PSRAM堆内存中
|
||||
auto *i2s_buff = static_cast<int32_t *>(heap_caps_malloc((samp_len_bytes), MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM));
|
||||
// 创建I2S转发副本 放在PSRAM堆内存中
|
||||
auto *raw_pcm16 = static_cast<int16_t *>(heap_caps_malloc(samp_len * sizeof(int16_t), MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM));
|
||||
if (!i2s_buff || !raw_pcm16) {
|
||||
ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for buffers");
|
||||
if (i2s_buff) free(i2s_buff);
|
||||
if (raw_pcm16) free(raw_pcm16);
|
||||
return;
|
||||
}
|
||||
size_t bytes_read;
|
||||
LatestDataForwarder::getInstance()->initialize(200); // 初始化转发队列,最多缓存 200 帧 I2S 数据
|
||||
size_t bytes_read; // 读取的字节数
|
||||
|
||||
while (tasksRunning) {
|
||||
esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
|
||||
// 读取I2S数据
|
||||
const esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
|
||||
if (ret != ESP_OK) {
|
||||
ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret));
|
||||
vTaskDelay(pdMS_TO_TICKS(10));
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
||||
continue;
|
||||
}
|
||||
// 处理音频数据(32位转16位)
|
||||
// 一次性处理所有数据转换
|
||||
for (int i = 0; i < samp_len; ++i) {
|
||||
i2s_buff[i] = i2s_buff[i] >> 14; // 32:8是有效位,转换为16位音频数据
|
||||
// 转发数据转换:32位转16位(取高16位)
|
||||
raw_pcm16[i] = static_cast<int16_t>(i2s_buff[i] >> 16);
|
||||
// AFE数据转换:右移14位(在原始数据上操作)
|
||||
i2s_buff[i] = i2s_buff[i] >> 14; // 32:8 是有效位,8:0 是低 8 位,全部为 0,AFE 输入是 16 位语音数据,29:13 位用于放大语音信号。
|
||||
}
|
||||
// 转发原始数据
|
||||
LatestDataForwarder::getInstance()->injectData(raw_pcm16, samp_len);
|
||||
// 喂数据给AFE
|
||||
afe_handle->feed(afe_data, reinterpret_cast<int16_t *>(i2s_buff));
|
||||
|
||||
// 复制一份到滑动窗口人声检测区 (给网络) 注入音频帧到管理器
|
||||
// AudioBufferManager::getInstance()->injectAudioFrame(
|
||||
// reinterpret_cast<int16_t*>(i2s_buff),
|
||||
// samp_len,
|
||||
// this->vad_state_
|
||||
// );
|
||||
// std::this_thread::sleep_for(std::chrono::milliseconds(5)); // 休眠5ms
|
||||
}
|
||||
free(i2s_buff);
|
||||
ESP_LOGI("SpeechRecognizer", "Feed task exited");
|
||||
@@ -293,6 +343,8 @@ void SpeechRecognizer::detectTask() {
|
||||
ESP_LOGI("SpeechRecognizer", "Ready for speech recognition");
|
||||
while (tasksRunning) {
|
||||
afe_fetch_result_t* res = afe_handle->fetch(afe_data);
|
||||
// 从res中取出 vad 状态
|
||||
this->vad_state_ = res->vad_state;
|
||||
if (!res || res->ret_value == ESP_FAIL) {
|
||||
ESP_LOGE("SpeechRecognizer", "AFE fetch error");
|
||||
vTaskDelay(pdMS_TO_TICKS(10));
|
||||
@@ -364,10 +416,6 @@ bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) {
|
||||
}
|
||||
// 打印缓存的指令
|
||||
ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str());
|
||||
ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
|
||||
esp_mn_commands_print();
|
||||
ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
|
||||
esp_mn_active_commands_print();
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -376,6 +424,7 @@ bool SpeechRecognizer::addCommands(const std::vector<std::pair<int, std::string>
|
||||
for (const auto& cmd : commands) {
|
||||
if (!addCommand(cmd.first, cmd.second)) {
|
||||
success = false;
|
||||
ESP_LOGE("main", "Failed to add some commands");
|
||||
}
|
||||
}
|
||||
ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
|
||||
@@ -459,6 +508,11 @@ std::string SpeechRecognizer::getCurrentState() const {
|
||||
return currentState;
|
||||
}
|
||||
|
||||
/// Returns the VAD state currently stored in `vad_state_`.
afe_vad_state_t SpeechRecognizer::getVadState() const {
    const afe_vad_state_t latest_state = vad_state_;
    return latest_state;
}
|
||||
|
||||
|
||||
bool SpeechRecognizer::isRunning() const {
|
||||
return running;
|
||||
}
|
||||
|
||||
@@ -63,7 +63,8 @@ public:
|
||||
|
||||
// 初始化语音识别系统
|
||||
bool init(const SpeechRecognizerConfig& config = SpeechRecognizerConfig());
|
||||
|
||||
// 重载版本
|
||||
bool init(const bool enable_vad, const vad_mode_t vad_mode, std::string model_path);
|
||||
// 反初始化
|
||||
void deinit();
|
||||
|
||||
@@ -106,6 +107,8 @@ public:
|
||||
// 获取当前识别状态
|
||||
std::string getCurrentState() const;
|
||||
|
||||
// 获取当前 VAD 识别状态(噪音、静音 or 人声) 需要开启 VAD
|
||||
afe_vad_state_t getVadState() const;
|
||||
private:
|
||||
SpeechRecognizer();
|
||||
~SpeechRecognizer();
|
||||
@@ -149,7 +152,8 @@ private:
|
||||
esp_afe_sr_data_t* afe_data;
|
||||
srmodel_list_t* models;
|
||||
esp_mn_iface_t* multinet;
|
||||
model_iface_data_t* model_data;
|
||||
model_iface_data_t* model_data; /// 模型数据
|
||||
afe_vad_state_t vad_state_; /// 语音活动检测状态
|
||||
|
||||
// 命令回调
|
||||
SpeechCommandCallback commandCallback;
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
//
|
||||
// Created by misaki on 2025/9/29.
|
||||
//
|
||||
|
||||
#include "VadSlidingWindow.h"
|
||||
|
||||
// Out-of-class definitions of AudioBufferManager's static members; the instance pointer starts out null.
|
||||
AudioBufferManager* AudioBufferManager::instance = nullptr;
|
||||
std::mutex AudioBufferManager::instance_mutex;
|
||||
@@ -0,0 +1,377 @@
|
||||
//
|
||||
// Created by misaki on 2025/9/29.
|
||||
//
|
||||
/**
|
||||
* 本模块用于处理音频数据,使用滑动窗口和VAD算法进行人声语音段过滤
|
||||
* 依赖于VAD人声检测数据,并通过滑动窗口管理区间数据,以精确过滤出人声音频数据
|
||||
* @author Misaki
|
||||
* @date 2025/9/29
|
||||
*/
|
||||
#pragma once
|
||||
#include <vector>
|
||||
#include <queue>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <atomic>
|
||||
|
||||
/**
 * One frame of 16-bit PCM audio together with its VAD flag.
 */
struct PCMFrame {
    std::vector<int16_t> audio_data;  // owned copy of the samples
    size_t data_length;               // number of samples held in audio_data
    bool vad_status;                  // VAD result for this frame
    uint64_t timestamp;               // optional tag, intended for debugging

    /** Creates an empty frame with the VAD flag cleared and timestamp 0. */
    PCMFrame() : data_length(0), vad_status(false), timestamp(0) {}

    /**
     * Creates a frame holding a copy of `len` samples starting at `data`.
     *
     * A null `data` pointer produces an empty frame (data_length == 0)
     * instead of reading through the null pointer.
     *
     * @param data Pointer to the samples; may be null.
     * @param len  Number of samples available at `data`.
     * @param vad  VAD result for the frame.
     * @param ts   Optional timestamp / sequence tag.
     */
    PCMFrame(const int16_t* data, const size_t len, const bool vad, const uint64_t ts = 0)
        : data_length(data ? len : 0), vad_status(vad), timestamp(ts) {
        if (data_length > 0) {
            audio_data.assign(data, data + data_length);
        }
    }
};
|
||||
|
||||
// 滑动窗口结构
|
||||
struct SlidingWindow {
|
||||
std::vector<PCMFrame> frames; // 窗口内的所有帧
|
||||
size_t total_frames; // 总帧数
|
||||
size_t voice_frames; // 人声帧数
|
||||
double voice_ratio; // 人声占比
|
||||
bool is_active; // 窗口是否处于活跃状态
|
||||
|
||||
SlidingWindow() : total_frames(0), voice_frames(0), voice_ratio(0.0), is_active(false) {}
|
||||
|
||||
// 计算人声占比
|
||||
void calculateVoiceRatio() {
|
||||
if (total_frames > 0) {
|
||||
voice_ratio = static_cast<double>(voice_frames) / total_frames;
|
||||
} else {
|
||||
voice_ratio = 0.0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class AudioBufferManager {
|
||||
private:
|
||||
// 单例实例
|
||||
static AudioBufferManager* instance;
|
||||
static std::mutex instance_mutex;
|
||||
|
||||
// 配置参数
|
||||
struct Config {
|
||||
size_t max_window_frames; // 单个窗口最大帧数
|
||||
size_t max_queue_size; // 队列最大长度
|
||||
size_t silence_threshold_frames; // 静音阈值帧数
|
||||
double voice_ratio_threshold; // 人声占比阈值
|
||||
size_t pre_voice_frames; // 人声开始前预保留帧数
|
||||
size_t post_voice_frames; // 人声结束后保留帧数
|
||||
} config;
|
||||
|
||||
// 内部状态
|
||||
std::unique_ptr<SlidingWindow> current_window;
|
||||
std::queue<std::unique_ptr<SlidingWindow>> completed_windows;
|
||||
std::vector<PCMFrame> pre_voice_buffer; // 人声开始前的预缓存
|
||||
|
||||
// 状态跟踪
|
||||
std::atomic<bool> in_voice_segment;
|
||||
size_t consecutive_silence_frames;
|
||||
size_t current_frame_count;
|
||||
|
||||
// 线程同步
|
||||
std::mutex data_mutex;
|
||||
std::condition_variable data_condition;
|
||||
|
||||
// 内存使用跟踪
|
||||
size_t estimated_memory_usage;
|
||||
const size_t MAX_MEMORY_BYTES = 512 * 1024; // 512KB
|
||||
|
||||
private:
|
||||
AudioBufferManager() {
|
||||
initializeDefaultConfig();
|
||||
resetState();
|
||||
}
|
||||
|
||||
void initializeDefaultConfig() {
|
||||
// 默认配置:基于16kHz采样率,每帧20ms(320样本)
|
||||
config.max_window_frames = 500; // 10秒音频(500 * 20ms)
|
||||
config.max_queue_size = 8; // 队列最多8个窗口
|
||||
config.silence_threshold_frames = 15; // 300ms静音判定结束(15 * 20ms)
|
||||
config.voice_ratio_threshold = 0.3; // 30%人声占比阈值
|
||||
config.pre_voice_frames = 5; // 人声开始前保留100ms
|
||||
config.post_voice_frames = 10; // 人声结束后保留200ms
|
||||
|
||||
estimated_memory_usage = 0;
|
||||
}
|
||||
|
||||
void resetState() {
|
||||
current_window = std::make_unique<SlidingWindow>();
|
||||
in_voice_segment = false;
|
||||
consecutive_silence_frames = 0;
|
||||
current_frame_count = 0;
|
||||
pre_voice_buffer.clear();
|
||||
pre_voice_buffer.reserve(config.pre_voice_frames);
|
||||
}
|
||||
|
||||
// 估算单个帧的内存使用
|
||||
size_t estimateFrameMemory(const PCMFrame& frame) const {
|
||||
return sizeof(PCMFrame) + (frame.audio_data.capacity() * sizeof(int16_t));
|
||||
}
|
||||
|
||||
// 估算窗口内存使用
|
||||
size_t estimateWindowMemory(const SlidingWindow& window) const {
|
||||
size_t memory = sizeof(SlidingWindow);
|
||||
for (const auto& frame : window.frames) {
|
||||
memory += estimateFrameMemory(frame);
|
||||
}
|
||||
return memory;
|
||||
}
|
||||
|
||||
// 检查内存限制
|
||||
bool checkMemoryConstraints() const {
|
||||
return estimated_memory_usage < MAX_MEMORY_BYTES;
|
||||
}
|
||||
|
||||
public:
|
||||
// 删除拷贝构造函数和赋值运算符
|
||||
AudioBufferManager(const AudioBufferManager&) = delete;
|
||||
AudioBufferManager& operator=(const AudioBufferManager&) = delete;
|
||||
|
||||
// 获取单例实例
|
||||
static AudioBufferManager* getInstance() {
|
||||
std::lock_guard<std::mutex> lock(instance_mutex);
|
||||
if (!instance) {
|
||||
instance = new AudioBufferManager();
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 配置管理器参数
|
||||
* @param max_window_frames 单个窗口最大帧数
|
||||
* @param max_queue_size 队列最大长度
|
||||
* @param silence_threshold 静音阈值帧数
|
||||
* @param voice_ratio_threshold 人声占比阈值
|
||||
* @param pre_voice_frames 人声开始前预保留帧数
|
||||
* @param post_voice_frames 人声结束后保留帧数
|
||||
*/
|
||||
void configure(const size_t max_window_frames = 500,
|
||||
const size_t max_queue_size = 8,
|
||||
const size_t silence_threshold = 15,
|
||||
const double voice_ratio_threshold = 0.2,
|
||||
const size_t pre_voice_frames = 5,
|
||||
const size_t post_voice_frames = 10) {
|
||||
std::lock_guard<std::mutex> lock(data_mutex);
|
||||
|
||||
config.max_window_frames = max_window_frames;
|
||||
config.max_queue_size = max_queue_size;
|
||||
config.silence_threshold_frames = silence_threshold;
|
||||
config.voice_ratio_threshold = voice_ratio_threshold;
|
||||
config.pre_voice_frames = pre_voice_frames;
|
||||
config.post_voice_frames = post_voice_frames;
|
||||
|
||||
// 重新初始化状态
|
||||
resetState();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 注入新的音频帧数据
|
||||
* @param audio_data PCM音频数据指针
|
||||
* @param data_length 数据长度(样本数)
|
||||
* @param vad_status 当前帧的VAD状态
|
||||
*/
|
||||
void injectAudioFrame(const int16_t* audio_data, size_t data_length, bool vad_status) {
|
||||
std::lock_guard<std::mutex> lock(data_mutex);
|
||||
|
||||
// 创建新帧
|
||||
PCMFrame new_frame(audio_data, data_length, vad_status, current_frame_count++);
|
||||
size_t frame_memory = estimateFrameMemory(new_frame);
|
||||
|
||||
// 检查内存限制
|
||||
if (!checkMemoryConstraints()) {
|
||||
// 内存不足,采取清理策略
|
||||
if (!completed_windows.empty()) {
|
||||
auto old_window = std::move(completed_windows.front());
|
||||
completed_windows.pop();
|
||||
estimated_memory_usage -= estimateWindowMemory(*old_window);
|
||||
}
|
||||
}
|
||||
|
||||
// 更新预缓存
|
||||
updatePreVoiceBuffer(new_frame);
|
||||
|
||||
// 状态机处理
|
||||
if (!in_voice_segment) {
|
||||
handleNonVoiceState(new_frame, frame_memory);
|
||||
} else {
|
||||
handleVoiceState(new_frame, frame_memory);
|
||||
}
|
||||
|
||||
estimated_memory_usage += frame_memory;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取可用的音频窗口数据
|
||||
* @param timeout_ms 超时时间(毫秒)
|
||||
* @return 滑动窗口指针,如果没有可用数据则返回nullptr
|
||||
*/
|
||||
std::unique_ptr<SlidingWindow> getAudioWindow(int timeout_ms = 0) {
|
||||
std::unique_lock<std::mutex> lock(data_mutex);
|
||||
if (completed_windows.empty()) {
|
||||
if (timeout_ms <= 0) {
|
||||
return nullptr;
|
||||
}
|
||||
// 等待数据可用
|
||||
if (data_condition.wait_for(lock,
|
||||
std::chrono::milliseconds(timeout_ms)) == std::cv_status::timeout) {
|
||||
return nullptr;
|
||||
}
|
||||
if (completed_windows.empty()) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
auto window = std::move(completed_windows.front());
|
||||
completed_windows.pop();
|
||||
estimated_memory_usage -= estimateWindowMemory(*window);
|
||||
return window;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 检查是否有可用的音频数据
|
||||
*/
|
||||
bool hasAvailableData() {
|
||||
std::lock_guard<std::mutex> lock(data_mutex);
|
||||
return !completed_windows.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取当前队列大小
|
||||
*/
|
||||
size_t getQueueSize() {
|
||||
std::lock_guard<std::mutex> lock(data_mutex);
|
||||
return completed_windows.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 获取估计的内存使用量
|
||||
*/
|
||||
size_t getEstimatedMemoryUsage() const {
|
||||
return estimated_memory_usage;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 强制结束当前语音段(如果有)
|
||||
*/
|
||||
void forceEndCurrentSegment() {
|
||||
std::lock_guard<std::mutex> lock(data_mutex);
|
||||
if (in_voice_segment && current_window->total_frames > 0) {
|
||||
finalizeCurrentWindow();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief 清空所有缓存数据
|
||||
*/
|
||||
void clearAllData() {
|
||||
std::lock_guard<std::mutex> lock(data_mutex);
|
||||
resetState();
|
||||
std::queue<std::unique_ptr<SlidingWindow>> empty_queue;
|
||||
std::swap(completed_windows, empty_queue);
|
||||
estimated_memory_usage = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
// 更新人声开始前的预缓存
|
||||
void updatePreVoiceBuffer(const PCMFrame& frame) {
|
||||
pre_voice_buffer.push_back(frame);
|
||||
|
||||
// 保持预缓存大小不超过配置值
|
||||
if (pre_voice_buffer.size() > config.pre_voice_frames) {
|
||||
pre_voice_buffer.erase(pre_voice_buffer.begin());
|
||||
}
|
||||
}
|
||||
|
||||
// 处理非人声状态
|
||||
void handleNonVoiceState(const PCMFrame& frame, size_t frame_memory) {
|
||||
if (frame.vad_status) {
|
||||
// 检测到人声开始
|
||||
in_voice_segment = true;
|
||||
consecutive_silence_frames = 0;
|
||||
|
||||
// 将预缓存数据加入当前窗口
|
||||
for (const auto& pre_frame : pre_voice_buffer) {
|
||||
current_window->frames.push_back(pre_frame);
|
||||
current_window->total_frames++;
|
||||
if (pre_frame.vad_status) {
|
||||
current_window->voice_frames++;
|
||||
}
|
||||
}
|
||||
|
||||
// 添加当前帧
|
||||
addFrameToCurrentWindow(frame);
|
||||
}
|
||||
// 非人声状态下,不进行其他处理
|
||||
}
|
||||
|
||||
// 处理人声状态
|
||||
void handleVoiceState(const PCMFrame& frame, size_t frame_memory) {
|
||||
if (frame.vad_status) {
|
||||
// 仍然是人声,重置静音计数
|
||||
consecutive_silence_frames = 0;
|
||||
} else {
|
||||
// 静音帧
|
||||
consecutive_silence_frames++;
|
||||
}
|
||||
|
||||
// 添加当前帧到窗口
|
||||
addFrameToCurrentWindow(frame);
|
||||
|
||||
// 检查是否需要结束当前语音段
|
||||
if (consecutive_silence_frames >= config.silence_threshold_frames ||
|
||||
current_window->frames.size() >= config.max_window_frames) {
|
||||
|
||||
// 添加人声结束后的保留帧
|
||||
addPostVoiceFrames();
|
||||
|
||||
// 完成当前窗口
|
||||
finalizeCurrentWindow();
|
||||
}
|
||||
}
|
||||
|
||||
// 添加帧到当前窗口
|
||||
void addFrameToCurrentWindow(const PCMFrame& frame) {
|
||||
current_window->frames.push_back(frame);
|
||||
current_window->total_frames++;
|
||||
if (frame.vad_status) {
|
||||
current_window->voice_frames++;
|
||||
}
|
||||
current_window->calculateVoiceRatio();
|
||||
}
|
||||
|
||||
// 添加人声结束后的保留帧
|
||||
void addPostVoiceFrames() {
|
||||
// 这个函数在实际实现中需要缓存后续的帧
|
||||
// 简化实现:在当前设计中,我们依赖静音阈值来自然包含结束后的帧
|
||||
}
|
||||
|
||||
// 完成当前窗口的处理
|
||||
void finalizeCurrentWindow() {
|
||||
// 计算最终的人声占比
|
||||
current_window->calculateVoiceRatio();
|
||||
|
||||
// 检查人声占比是否达到阈值
|
||||
if (current_window->voice_ratio >= config.voice_ratio_threshold) {
|
||||
// 窗口有效,加入队列
|
||||
if (completed_windows.size() >= config.max_queue_size) {
|
||||
// 队列已满,移除最旧的数据
|
||||
auto old_window = std::move(completed_windows.front());
|
||||
completed_windows.pop();
|
||||
estimated_memory_usage -= estimateWindowMemory(*old_window);
|
||||
}
|
||||
|
||||
completed_windows.push(std::move(current_window));
|
||||
data_condition.notify_one(); // 通知等待的消费者
|
||||
}
|
||||
|
||||
// 重置状态,开始新的窗口
|
||||
resetState();
|
||||
}
|
||||
};
|
||||
Reference in New Issue
Block a user