这是一次长久的提交：

1. 应用界面增加了返回主页的按钮 2. 修复了gif渲染内存泄漏的严重bug 3. 将PetDao当中的cJSON API替换为cpp_json，完美通过测试 4. 整合已经实现的各种上层建筑，实现了一个宠物对话基本业务应用，用于样品测试展示 5. 重构了音频播放类，使其更modern，更加便于移植和拓展
2025-10-16 11:36:45 +08:00
parent 801138631e
commit ba5e47bc77
38 changed files with 2487 additions and 2008 deletions
@@ -2,20 +2,22 @@
 // Created by misaki on 2025/9/15.
 //
 #include "SpeechRecognizer.h"
-#include "esp_afe_sr_models.h"
-#include "esp_mn_models.h"
-#include "esp_wn_iface.h"
-#include "esp_mn_speech_commands.h"
-#include "model_path.h"
-#include "driver/gpio.h"
-#include "soc/soc_caps.h"
-#include "esp_err.h"
-#include "nvs_flash.h"
-#include "freertos/FreeRTOS.h"
-#include "freertos/task.h"
+#include "VadSlidingWindow.h"
+#include "SimpleI2SForwarder.h"
+
+#include <esp_afe_sr_models.h>
+#include <esp_mn_models.h>
+#include <esp_wn_iface.h>
+#include <esp_mn_speech_commands.h>
+#include <model_path.h>
+#include <driver/gpio.h>
+#include <soc/soc_caps.h>
+#include <esp_err.h>
+#include <nvs_flash.h>
+#include <freertos/FreeRTOS.h>
+#include <freertos/task.h>
 #include <atomic>
 #include <cstring>
-#include <memory>
 #include <utility>

 // 初始化静态成员变量
@@ -40,8 +42,8 @@ SpeechRecognizer::SpeechRecognizer()
      models(nullptr),
      multinet(nullptr),
      model_data(nullptr),
-      tasksRunning(false) {
-}
+      vad_state_(AFE_VAD_SILENCE),
+      tasksRunning(false){}

 SpeechRecognizer::~SpeechRecognizer() {
    deinit();
@@ -68,6 +70,29 @@ bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) {
    return true;
 }

+/**
+ * Convenience overload: initialize the recognizer from individual settings.
+ *
+ * Stores the VAD switch, VAD mode and model path in `config`, then brings up
+ * I2S followed by ESP-SR. Calling it on an already-initialized instance is a
+ * no-op that returns true.
+ *
+ * If either step fails, any I2S channel that was already created is released
+ * again. Otherwise the handle would leak, because deinit() returns early
+ * while `initialized` is still false.
+ *
+ * @param enable_vad  Value stored in config.enable_vad.
+ * @param vad_mode    Value stored in config.vad_mode.
+ * @param model_path  Value moved into config.model_path.
+ * @return true on success or if already initialized; false if I2S or ESP-SR
+ *         initialization failed.
+ */
+bool SpeechRecognizer::init(const bool enable_vad, const vad_mode_t vad_mode, std::string model_path) {
+    if (initialized) {
+        ESP_LOGI("SpeechRecognizer", "Already initialized");
+        return true;
+    }
+    this->config.enable_vad = enable_vad;
+    this->config.vad_mode = vad_mode;
+    this->config.model_path = std::move(model_path);
+
+    // Roll back a partially created I2S channel, using the same sequence as deinit().
+    // NOTE(review): assumes rx_handle is nullptr until initI2S() creates the channel - confirm in the constructor.
+    const auto releaseI2S = [this]() {
+        if (rx_handle) {
+            i2s_channel_disable(rx_handle);  // Result ignored on purpose: the channel may not be enabled yet.
+            i2s_del_channel(rx_handle);
+            rx_handle = nullptr;
+        }
+    };
+
+    // Bring up I2S first.
+    if (!initI2S()) {
+        ESP_LOGE("SpeechRecognizer", "I2S initialization failed");
+        releaseI2S();
+        return false;
+    }
+    // Then bring up ESP-SR.
+    // NOTE(review): ESP-SR resources allocated inside initESP_SR() before a failure are not released here - verify there.
+    if (!initESP_SR()) {
+        ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed");
+        releaseI2S();
+        return false;
+    }
+    initialized = true;
+    ESP_LOGI("SpeechRecognizer", "Initialization completed successfully");
+    return true;
+}
+
 void SpeechRecognizer::deinit() {
    if (!initialized) {
        return;
@@ -88,8 +113,8 @@ void SpeechRecognizer::deinit() {
    }
    // 释放I2S资源
    if (rx_handle) {
-        i2s_channel_disable(rx_handle);
-        i2s_del_channel(rx_handle);
+        i2s_channel_disable(rx_handle);     // 删除通道之前必须先禁用通道
+        i2s_del_channel(rx_handle);         // 删除该句柄以释放通道资源
        rx_handle = nullptr;
    }
    initialized = false;
@@ -98,30 +123,33 @@ void SpeechRecognizer::deinit() {

 bool SpeechRecognizer::initI2S() {
    esp_err_t ret = ESP_OK;
-    i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
+    // 通过辅助宏获取默认的通道配置, 它可以帮助指定 I2S 角色和端口 ID
+    constexpr i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
+    // 分配新的 RX 通道并获取该通道的句柄
    ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle);
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret));
        return false;
    }
+    // 进行配置，通过宏生成声道配置和时钟配置, 这两个辅助宏在 'i2s_std.h' 中定义，只能用于 STD 模式
    i2s_std_config_t std_cfg = {
-        .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
-        .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),
+        .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),       // 16KHz采样率
+        .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),  // 32位单声道
        .gpio_cfg = {
-            .mclk = GPIO_NUM_NC,
-            .bclk = config.bclk_pin,
-            .ws = config.ws_pin,
-            .dout = GPIO_NUM_NC,
-            .din = config.din_pin,
-            .invert_flags = {
+            .mclk = GPIO_NUM_NC,        // 不使用MCLK
+            .bclk = config.bclk_pin,    // BCLK引脚
+            .ws = config.ws_pin,        // WS引脚
+            .dout = GPIO_NUM_NC,        // 不使用DOUT
+            .din = config.din_pin,      // DIN引脚
+            .invert_flags = {       // 不使用倒置
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv = false,
            },
        },
    };
-    std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;
-    ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);
+    std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;        // 右声道
+    ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);   // 初始化STD标准模式
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret));
        return false;
@@ -177,7 +205,7 @@ bool SpeechRecognizer::initESP_SR() {
        ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config");
        return false;
    }
-    // 加载MultiNet模型(采用esp-sr提供的宏来处理不同语种的模型的处理问题)
+    // 加载MultiNet模型 (采用esp-sr提供的宏来处理不同语种的模型的处理问题)
 #if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT)
    char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
 #else
@@ -211,7 +239,7 @@ bool SpeechRecognizer::start() {
        return true;
    }
    // 启用I2S通道
-    esp_err_t ret = i2s_channel_enable(rx_handle);
+    esp_err_t ret = i2s_channel_enable(rx_handle);      // 在读取数据之前，先启动 RX 通道
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(ret));
        return false;
@@ -253,29 +281,51 @@ void SpeechRecognizer::stop() {

 void SpeechRecognizer::feedTask() {
    ThreadManager::printThreadInfo("Feed task started");
-    int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
-    int nch = afe_handle->get_channel_num(afe_data);
-    size_t samp_len = audio_chunksize;
-    size_t samp_len_bytes = samp_len * sizeof(int32_t);  // 单声道32位
-    auto *i2s_buff = static_cast<int32_t *>(malloc(samp_len_bytes));
-    if (!i2s_buff) {
-        ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer");
+    const int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
+    ESP_LOGW("SpeechRecognizer", "Feed task: audio_chunksize=%d", audio_chunksize);
+    int nch = afe_handle->get_channel_num(afe_data);        // 获取I2S通道的声道数, 此处为1, 因此并没有被下面所使用
+    const size_t samp_len = audio_chunksize;
+    const size_t samp_len_bytes = samp_len * sizeof(int32_t);  // 单声道32位
+    // 分配I2S缓冲区 放在PSRAM堆内存中
+    auto *i2s_buff = static_cast<int32_t *>(heap_caps_malloc((samp_len_bytes), MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM));
+    // 创建I2S转发副本 放在PSRAM堆内存中
+    auto *raw_pcm16 = static_cast<int16_t *>(heap_caps_malloc(samp_len * sizeof(int16_t), MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM));
+    if (!i2s_buff || !raw_pcm16) {
+        ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for buffers");
+        if (i2s_buff) free(i2s_buff);
+        if (raw_pcm16) free(raw_pcm16);
        return;
    }
-    size_t bytes_read;
+    LatestDataForwarder::getInstance()->initialize(200);    // 初始化转发队列，最多缓存 200 帧 I2S 数据
+    size_t bytes_read;      // 读取的字节数
+
    while (tasksRunning) {
-        esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
+        // 读取I2S数据
+        const esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
        if (ret != ESP_OK) {
            ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret));
-            vTaskDelay(pdMS_TO_TICKS(10));
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            continue;
        }
-        // 处理音频数据（32位转16位）
+        // 一次性处理所有数据转换
        for (int i = 0; i < samp_len; ++i) {
-            i2s_buff[i] = i2s_buff[i] >> 14; // 32:8是有效位，转换为16位音频数据
+            // 转发数据转换：32位转16位（取高16位）
+            raw_pcm16[i] = static_cast<int16_t>(i2s_buff[i] >> 16);
+            // AFE数据转换：右移14位（在原始数据上操作）
+            i2s_buff[i] = i2s_buff[i] >> 14;    // 32:8 是有效位，8:0 是低 8 位，全部为 0，AFE 输入是 16 位语音数据，29:13 位用于放大语音信号。
        }
+        // 转发原始数据
+        LatestDataForwarder::getInstance()->injectData(raw_pcm16, samp_len);
        // 喂数据给AFE
        afe_handle->feed(afe_data, reinterpret_cast<int16_t *>(i2s_buff));
+
+        // 复制一份到滑动窗口人声检测区 (给网络) 注入音频帧到管理器
+        // AudioBufferManager::getInstance()->injectAudioFrame(
+        //     reinterpret_cast<int16_t*>(i2s_buff),
+        //     samp_len,
+        //     this->vad_state_
+        // );
+        // std::this_thread::sleep_for(std::chrono::milliseconds(5));     // 休眠5ms
    }
    free(i2s_buff);
    ESP_LOGI("SpeechRecognizer", "Feed task exited");
@@ -293,6 +343,8 @@ void SpeechRecognizer::detectTask() {
    ESP_LOGI("SpeechRecognizer", "Ready for speech recognition");
    while (tasksRunning) {
        afe_fetch_result_t* res = afe_handle->fetch(afe_data);
+        // 从res中取出 vad 状态
+        this->vad_state_ = res->vad_state;
        if (!res || res->ret_value == ESP_FAIL) {
            ESP_LOGE("SpeechRecognizer", "AFE fetch error");
            vTaskDelay(pdMS_TO_TICKS(10));
@@ -364,10 +416,6 @@ bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) {
    }
    // 打印缓存的指令
    ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str());
-    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
-    esp_mn_commands_print();
-    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
-    esp_mn_active_commands_print();
    return true;
 }

@@ -376,6 +424,7 @@ bool SpeechRecognizer::addCommands(const std::vector<std::pair<int, std::string>
    for (const auto& cmd : commands) {
        if (!addCommand(cmd.first, cmd.second)) {
            success = false;
+            ESP_LOGE("main", "Failed to add some commands");
        }
    }
    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
@@ -459,6 +508,11 @@ std::string SpeechRecognizer::getCurrentState() const {
    return currentState;
 }

+afe_vad_state_t SpeechRecognizer::getVadState() const {  // Returns the cached VAD state held in vad_state_.
+    return vad_state_;  // NOTE(review): detectTask() overwrites this from each AFE fetch result; no synchronization is visible here - confirm thread-safety.
+}
+
+
 bool SpeechRecognizer::isRunning() const {  // Reports the current value of the `running` member.
    return running;  // NOTE(review): presumably toggled by start()/stop() - confirm against those methods.
 }