1. 完成了语音识别的C++业务层封装，测试通过

2. 测试了 LVGL GIF 渲染 + 音乐播放 + 语音识别的组合：经过简单优化后，LVGL 渲染仍略显卡顿，语音识别偶尔出现缓冲区空警告；目前不影响基本功能，但还需要进一步深度优化。
2025-09-16 01:29:17 +08:00
parent dc420c3b7a
commit 4cc761aab3
26 changed files with 134775 additions and 32 deletions
@@ -113,8 +113,98 @@ void testPetSystem() {
    std::cout << SDFileManager::getInstance()->catCommand("/sdcard/pet_data/my_pet.json") << std::endl;
 }

+
+#include "SpeechRecognizer.h"
+#include <nvs.h>
+#include <nvs_flash.h>
+/**
+ * Example command callback: logs every recognised command and dispatches on its ID.
+ *
+ * testMIC() registers command IDs 0..5, so all six are handled here; only IDs
+ * outside that range are reported as unknown.
+ *
+ * @param command_id  ID of the recognised command.
+ * @param phrase      Text of the recognised phrase.
+ * @param probability Score reported for the recognition.
+ */
+void commandCallback(int command_id, const std::string& phrase, float probability) {
+    ESP_LOGI("Example", "Received command: ID=%d, Phrase='%s', Probability=%.2f",
+             command_id, phrase.c_str(), probability);
+
+    // Dispatch on the command ID; each case is a placeholder for the real action.
+    switch (command_id) {
+        case 0:  // "kai deng" (turn the light on)
+            ESP_LOGI("Example", "执行命令0");
+            break;
+        case 1:  // "guan deng" (turn the light off)
+            ESP_LOGI("Example", "执行命令1");
+            break;
+        case 2:  // "ti gao liang du" (increase brightness)
+            ESP_LOGI("Example", "执行命令2");
+            break;
+        case 3:  // "jiang di liang du" (decrease brightness)
+            ESP_LOGI("Example", "执行命令3");
+            break;
+        case 4:  // "bo fang yin yue" (play music)
+            ESP_LOGI("Example", "执行命令4");
+            break;
+        case 5:  // "ting zhi bo fang" (stop playback)
+            ESP_LOGI("Example", "执行命令5");
+            break;
+        default:
+            ESP_LOGI("Example", "未知的命令ID: %d", command_id);
+            break;
+    }
+}
+
+// State callback: logs the state name handed over by the recogniser.
+void stateCallback(const std::string& state) {
+    const char* const state_name = state.c_str();
+    ESP_LOGI("Example", "状态改变到: %s", state_name);
+}
+#include "SDFileManager.h"
+/**
+ * Speech-recognition smoke test.
+ *
+ * Initialises NVS and the SD card, configures and initialises the
+ * SpeechRecognizer singleton, registers six pinyin commands plus the two
+ * logging callbacks, and finally starts recognition.
+ */
+void testMIC() {
+    // Bring up NVS; erase and retry once when the partition is full or was
+    // written by a newer NVS version.
+    esp_err_t nvs_status = nvs_flash_init();
+    const bool nvs_needs_erase = nvs_status == ESP_ERR_NVS_NO_FREE_PAGES ||
+                                 nvs_status == ESP_ERR_NVS_NEW_VERSION_FOUND;
+    if (nvs_needs_erase) {
+        ESP_ERROR_CHECK(nvs_flash_erase());
+        nvs_status = nvs_flash_init();
+    }
+    ESP_ERROR_CHECK(nvs_status);
+
+    // Initialise the SD card manager.
+    SDFileManager::getInstance()->tryInitSDCard();
+
+    // Recogniser settings: VAD enabled in mode 3, models under /sdcard/srmodels.
+    SpeechRecognizerConfig sr_config;
+    sr_config.enable_vad = true;
+    sr_config.vad_mode = VAD_MODE_3;
+    sr_config.model_path = "/sdcard/srmodels";
+
+    auto* const sr = SpeechRecognizer::getInstance();
+    if (!sr->init(sr_config)) {
+        ESP_LOGE("main", "Failed to initialize speech recognizer");
+        return;
+    }
+
+    // Custom commands as (ID, pinyin phrase) pairs.
+    const std::vector<std::pair<int, std::string>> command_table{
+        {0, "kai deng"},           // turn the light on
+        {1, "guan deng"},          // turn the light off
+        {2, "ti gao liang du"},    // increase brightness
+        {3, "jiang di liang du"},  // decrease brightness
+        {4, "bo fang yin yue"},    // play music
+        {5, "ting zhi bo fang"},   // stop playback
+    };
+    const bool all_added = sr->addCommands(command_table);
+    if (!all_added) {
+        ESP_LOGE("main", "Failed to add some commands");
+    }
+
+    // Hook up the logging callbacks.
+    sr->registerCommandCallback(commandCallback);
+    sr->registerStateCallback(stateCallback);
+
+    // Start recognising.
+    const bool started = sr->start();
+    if (!started) {
+        ESP_LOGE("main", "Failed to start speech recognition");
+        return;
+    }
+
+    ESP_LOGI("main", "Speech recognition system started successfully");
+}
+
 void Cpp_Hand() {
-    testPetSystem();
+    testMIC();
+    // testPetSystem();

    OTAClass oc;
    oc.Init();
@@ -170,7 +170,7 @@ void OTAClass::Init() {
    AudioOutput::getInstance()->setVolume(5);

    // 同步播放
-    AudioOutput::getInstance()->playSync("/sdcard/music", "Old_Memory.mp3");
+    AudioOutput::getInstance()->playSync("/sdcard/music", "kokoronashi.mp3");


    // // 配置Wifi连接线程参数
@@ -5,7 +5,6 @@
 #include "PetDao.h"
 #include <iostream>
 #include <sstream>
-
 using namespace PetEnumConverter;

 // PetEnumConverter 实现
@@ -6,7 +6,6 @@
 #include "PetBaseClass.h"
 #include "SDFileManager.h"
 #include "cJSON.h"
-#include <unordered_map>
 #include <string>

 // 辅助函数：枚举类型与字符串的转换
@@ -62,4 +61,48 @@ private:

    // 宠物数据存储目录
    static constexpr const char* PET_DATA_DIR = "/sdcard/pet_data";
-};
+};
+
+/**
+ * 宠物数据结构(JSON)
+{
+  "name": "芝士雪豹",
+  "hp": 85,
+  "density": 120,
+  "identity": "我是顶真，是妈妈省的",
+  "stage_strategy": {
+    "current_stage": "PET_STAGE_ADULT",
+    "stage_model_map": {
+      "PET_STAGE_YOUNG": "/models/snow_leopard_young.gif",
+      "PET_STAGE_ADULT": "/models/snow_leopard_adult.gif",
+      "PET_STAGE_OLD": "/models/snow_leopard_old.gif"
+    },
+    "stage_audio_map": {
+      "PET_STAGE_YOUNG": "/audio/snow_leopard_young.mp3",
+      "PET_STAGE_ADULT": "/audio/snow_leopard_adult.mp3",
+      "PET_STAGE_OLD": "/audio/snow_leopard_old.mp3"
+    }
+  },
+  "action_strategy": {
+    "current_action": "PET_ACTION_SLEEP",
+    "action_model_map": {
+      "PET_ACTION_SLEEP": "/models/actions/sleep.gif",
+      "PET_ACTION_EAT": "/models/actions/eat.gif",
+      "PET_ACTION_HAPPY": "/models/actions/happy.gif",
+      "PET_ACTION_ANGRY": "/models/actions/angry.gif",
+      "PET_ACTION_SAD": "/models/actions/sad.gif",
+      "PET_ACTION_EVOLVE": "/models/actions/evolve.gif",
+      "PET_ACTION_TOUCH": "/models/actions/touch.gif"
+    },
+    "action_audio_map": {
+      "PET_ACTION_SLEEP": "/audio/actions/sleep.mp3",
+      "PET_ACTION_EAT": "/audio/actions/eat.mp3",
+      "PET_ACTION_HAPPY": "/audio/actions/happy.mp3",
+      "PET_ACTION_ANGRY": "/audio/actions/angry.mp3",
+      "PET_ACTION_SAD": "/audio/actions/sad.mp3",
+      "PET_ACTION_EVOLVE": "/audio/actions/evolve.mp3",
+      "PET_ACTION_TOUCH": "/audio/actions/touch.mp3"
+    }
+  }
+}
+ */
@@ -47,9 +47,9 @@ LVGLRender::LVGLRender() {
    ESP_LOGI("LVGL_Render", "LVGL_Render构造函数...创建LVGL心跳...");

    ThreadConfig trickConfig;
-    trickConfig.core_id = 1;    // 渲染分配给核1
-    trickConfig.name = "LVGL_Render_Heartbeat";
-    trickConfig.priority  = 5;
+    trickConfig.core_id = 1;    // 渲染分配给核0
+    trickConfig.name = "LVGL_Render";
+    trickConfig.priority  = 5;  //
    trickConfig.stack_size = 4096;  // 给LVGL一个较大的堆栈，避免栈溢出

    std::thread tick_thread = ThreadManager::createMemberThread(trickConfig, this, &LVGLRender::LVGL_Update);
@@ -0,0 +1,465 @@
+//
+// Created by misaki on 2025/9/15.
+//
+#include "SpeechRecognizer.h"
+#include "esp_afe_sr_models.h"
+#include "esp_mn_models.h"
+#include "esp_wn_iface.h"
+#include "esp_mn_speech_commands.h"
+#include "model_path.h"
+#include "driver/gpio.h"
+#include "soc/soc_caps.h"
+#include "esp_err.h"
+#include "nvs_flash.h"
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include <atomic>
+#include <cstring>
+#include <memory>
+#include <utility>
+
+// 初始化静态成员变量
+SpeechRecognizer* SpeechRecognizer::instance = nullptr;
+std::mutex SpeechRecognizer::instanceMutex;
+
+// Returns the process-wide recogniser, creating it on first use.
+// Creation is serialised by instanceMutex.
+SpeechRecognizer* SpeechRecognizer::getInstance() {
+    const std::lock_guard<std::mutex> guard(instanceMutex);
+    if (instance == nullptr) {
+        instance = new SpeechRecognizer();
+    }
+    return instance;
+}
+
+// Constructs an idle recogniser: not initialised, not running, recognition
+// enabled, and every driver / ESP-SR handle set to nullptr.
+SpeechRecognizer::SpeechRecognizer()
+    : initialized(false),
+      running(false),
+      enabled(true),
+      rx_handle(nullptr),
+      afe_handle(nullptr),
+      afe_data(nullptr),
+      models(nullptr),
+      multinet(nullptr),
+      model_data(nullptr),
+      tasksRunning(false) {
+}
+
+// Releases everything through deinit(), which is a no-op when the recogniser
+// was never initialised.
+SpeechRecognizer::~SpeechRecognizer() {
+    deinit();
+}
+
+/**
+ * Initialises the I2S capture channel and the ESP-SR pipeline.
+ *
+ * Calling it again after a successful initialisation is a no-op that returns
+ * true. When a step fails, everything created so far during this attempt is
+ * released again so that a later retry starts from a clean state (otherwise
+ * the I2S channel would stay allocated while deinit() refuses to run because
+ * `initialized` is still false).
+ *
+ * @param config Settings to use; copied into the recogniser.
+ * @return true on success (or when already initialised), false otherwise.
+ */
+bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) {
+    if (initialized) {
+        ESP_LOGI("SpeechRecognizer", "Already initialized");
+        return true;
+    }
+    this->config = config;
+
+    // Undo a partially completed initialisation, in reverse creation order.
+    const auto rollback = [this]() {
+        if (model_data && multinet) {
+            multinet->destroy(model_data);
+            model_data = nullptr;
+        }
+        if (afe_data && afe_handle) {
+            afe_handle->destroy(afe_data);
+            afe_data = nullptr;
+        }
+        if (rx_handle) {
+            i2s_del_channel(rx_handle);
+            rx_handle = nullptr;
+        }
+    };
+
+    // Initialise I2S
+    if (!initI2S()) {
+        ESP_LOGE("SpeechRecognizer", "I2S initialization failed");
+        rollback();
+        return false;
+    }
+    // Initialise ESP-SR
+    if (!initESP_SR()) {
+        ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed");
+        rollback();
+        return false;
+    }
+    initialized = true;
+    ESP_LOGI("SpeechRecognizer", "Initialization completed successfully");
+    return true;
+}
+
+/**
+ * Stops recognition and releases every resource acquired by init().
+ *
+ * Does nothing when the recogniser is not initialised.
+ */
+void SpeechRecognizer::deinit() {
+    if (!initialized) {
+        return;
+    }
+    // stop() joins the worker threads and disables the I2S channel if it was
+    // enabled by start().
+    stop();
+    // Release the ESP-SR resources
+    if (model_data && multinet) {
+        multinet->destroy(model_data);
+        model_data = nullptr;
+    }
+    if (afe_data && afe_handle) {
+        afe_handle->destroy(afe_data);
+        afe_data = nullptr;
+    }
+    if (models) {
+        // Counterpart of esp_srmodel_init(): frees the model list.
+        esp_srmodel_deinit(models);
+        models = nullptr;
+    }
+    // Release the I2S channel. No i2s_channel_disable() here: the channel is
+    // only ever enabled by start() and has already been disabled by stop(),
+    // so a second disable would merely produce a driver error.
+    if (rx_handle) {
+        i2s_del_channel(rx_handle);
+        rx_handle = nullptr;
+    }
+    initialized = false;
+    ESP_LOGI("SpeechRecognizer", "Deinitialized");
+}
+
+/**
+ * Creates the I2S RX channel on I2S_NUM_1 (master role) and configures it in
+ * standard Philips mode: 16 kHz, 32-bit mono slots, right slot only, using the
+ * BCLK / WS / DIN pins from `config`.
+ *
+ * The channel is only created and configured here; it is enabled in start().
+ *
+ * @return true on success; false on failure, in which case no channel is left
+ *         allocated and rx_handle is nullptr.
+ */
+bool SpeechRecognizer::initI2S() {
+    esp_err_t ret = ESP_OK;
+    i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
+    ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle);
+    if (ret != ESP_OK) {
+        ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret));
+        return false;
+    }
+    i2s_std_config_t std_cfg = {
+        .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
+        .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),
+        .gpio_cfg = {
+            .mclk = GPIO_NUM_NC,
+            .bclk = config.bclk_pin,
+            .ws = config.ws_pin,
+            .dout = GPIO_NUM_NC,
+            .din = config.din_pin,
+            .invert_flags = {
+                .mclk_inv = false,
+                .bclk_inv = false,
+                .ws_inv = false,
+            },
+        },
+    };
+    std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;
+    ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);
+    if (ret != ESP_OK) {
+        ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret));
+        // Do not leak the freshly created channel: a later retry would not be
+        // able to allocate it again.
+        i2s_del_channel(rx_handle);
+        rx_handle = nullptr;
+        return false;
+    }
+    ESP_LOGI("SpeechRecognizer", "I2S initialized successfully");
+    return true;
+}
+
+/**
+ * Sets up the ESP-SR pipeline: obtains the AFE interface, loads the model list
+ * from config.model_path, creates the AFE instance and creates the MultiNet
+ * command-recognition model with config.detection_timeout.
+ *
+ * @return true when every step succeeded, false otherwise. On failure the
+ *         objects already created by earlier steps are not released here.
+ */
+bool SpeechRecognizer::initESP_SR() {
+    // Obtain the AFE interface handle
+    afe_handle = &ESP_AFE_SR_HANDLE;
+    // NOTE(review): the address of a global object is never null, so this
+    // check cannot fail.
+    if (!afe_handle) {
+        ESP_LOGE("SpeechRecognizer", "Failed to get AFE handle");
+        return false;
+    }
+    // Load the model list
+    models = esp_srmodel_init(config.model_path.c_str());
+    if (!models) {
+        ESP_LOGE("SpeechRecognizer", "Failed to initialize models from path: %s", config.model_path.c_str());
+        return false;
+    }
+    // Configure the AFE; AEC / SE / VAD switches and the VAD mode come from `config`
+    afe_config_t afe_config = {
+        .aec_init = config.enable_aec,
+        .se_init = config.enable_se,
+        .vad_init = config.enable_vad,
+        .wakenet_init = false,  // wake-word detection disabled
+        .voice_communication_init = false,
+        .voice_communication_agc_init = false,
+        .voice_communication_agc_gain = 15,
+        .vad_mode = config.vad_mode,
+        .wakenet_model_name = nullptr,
+        .wakenet_model_name_2 = nullptr,
+        .wakenet_mode = DET_MODE_2CH_90,
+        .afe_mode = SR_MODE_LOW_COST,
+        .afe_perferred_core = 0,
+        .afe_perferred_priority = 5,
+        .afe_ringbuf_size = 10,
+        .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
+        .afe_linear_gain = 1.0,
+        .agc_mode = AFE_MN_PEAK_AGC_MODE_2,
+        .pcm_config = {
+            .total_ch_num = 2,
+            .mic_num = 1,
+            .ref_num = 1,
+            .sample_rate = 16000,
+        },
+        .debug_init = false,
+        .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, nullptr}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, nullptr}},
+    };
+    afe_data = afe_handle->create_from_config(&afe_config);
+    if (!afe_data) {
+        ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config");
+        return false;
+    }
+    // Select the MultiNet model: the sdkconfig macros below choose the Chinese
+    // model when one of the Chinese MultiNet options is set, English otherwise
+#if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT)
+    char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
+#else
+    char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH);
+#endif
+    if (!mn_name) {
+        ESP_LOGE("SpeechRecognizer", "No MultiNet model found");
+        return false;
+    }
+    multinet = esp_mn_handle_from_name(mn_name);
+    if (!multinet) {
+        ESP_LOGE("SpeechRecognizer", "Failed to get MultiNet handle");
+        return false;
+    }
+    // The second argument is the detection timeout taken from the configuration
+    model_data = multinet->create(mn_name, config.detection_timeout);
+    if (!model_data) {
+        ESP_LOGE("SpeechRecognizer", "Failed to create MultiNet model data");
+        return false;
+    }
+    ESP_LOGI("SpeechRecognizer", "ESP-SR initialized successfully with model: %s", mn_name);
+    return true;
+}
+
+/**
+ * Enables the I2S channel and launches the feed and detect worker threads.
+ *
+ * @return false when the recogniser is not initialised or the I2S channel
+ *         cannot be enabled; true when recognition is (or already was) running.
+ */
+bool SpeechRecognizer::start() {
+    if (!initialized) {
+        ESP_LOGE("SpeechRecognizer", "Not initialized");
+        return false;
+    }
+    if (running) {
+        ESP_LOGI("SpeechRecognizer", "Already running");
+        return true;
+    }
+
+    // Audio capture has to be live before the workers start reading from it.
+    const esp_err_t enable_status = i2s_channel_enable(rx_handle);
+    if (enable_status != ESP_OK) {
+        ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(enable_status));
+        return false;
+    }
+
+    // Raise the loop flag first, then spawn both workers through ThreadManager.
+    tasksRunning = true;
+    feedThread = ThreadManager::createMemberThread(config.feed_thread_config,
+                                                   this,
+                                                   &SpeechRecognizer::feedTask);
+    detectThread = ThreadManager::createMemberThread(config.detect_thread_config,
+                                                     this,
+                                                     &SpeechRecognizer::detectTask);
+
+    running = true;
+    updateState("started");
+    ESP_LOGI("SpeechRecognizer", "Speech recognition started");
+    return true;
+}
+
+/**
+ * Stops recognition: asks both workers to leave their loops, joins them,
+ * disables the I2S channel and publishes the "stopped" state.
+ * Does nothing when recognition is not running.
+ */
+void SpeechRecognizer::stop() {
+    if (running) {
+        // Signal the worker loops to exit, then wait for feed and detect in turn.
+        tasksRunning = false;
+        for (std::thread* worker : {&feedThread, &detectThread}) {
+            if (worker->joinable()) {
+                worker->join();
+            }
+        }
+
+        // Capture is no longer needed once both workers are gone.
+        if (rx_handle) {
+            i2s_channel_disable(rx_handle);
+        }
+
+        running = false;
+        updateState("stopped");
+        ESP_LOGI("SpeechRecognizer", "Speech recognition stopped");
+    }
+}
+
+/**
+ * Worker loop that reads audio from the I2S channel and feeds it to the AFE.
+ *
+ * Runs until tasksRunning becomes false. Each iteration reads one feed chunk
+ * of 32-bit samples, scales them in place and passes the buffer to
+ * afe_handle->feed().
+ */
+void SpeechRecognizer::feedTask() {
+    ThreadManager::printThreadInfo("Feed task started");
+    int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
+    // NOTE(review): nch is queried but not used anywhere below.
+    int nch = afe_handle->get_channel_num(afe_data);
+    size_t samp_len = audio_chunksize;
+    size_t samp_len_bytes = samp_len * sizeof(int32_t);  // mono, 32-bit samples
+    auto *i2s_buff = static_cast<int32_t *>(malloc(samp_len_bytes));
+    if (!i2s_buff) {
+        ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer");
+        return;
+    }
+    size_t bytes_read;
+    while (tasksRunning) {
+        // Blocks without a timeout until the read completes or fails
+        esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
+        if (ret != ESP_OK) {
+            ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret));
+            vTaskDelay(pdMS_TO_TICKS(10));
+            continue;
+        }
+        // Process the audio data (32-bit to 16-bit)
+        for (int i = 0; i < samp_len; ++i) {
+            i2s_buff[i] = i2s_buff[i] >> 14; // bits 32:8 are the valid bits; convert to 16-bit audio data
+        }
+        // Feed the data to the AFE. The buffer still holds one 32-bit word per
+        // sample, so feed() sees two int16_t values for every captured sample.
+        // NOTE(review): presumably this relies on the AFE expecting two
+        // interleaved channels (total_ch_num = 2 in initESP_SR) — confirm.
+        afe_handle->feed(afe_data, reinterpret_cast<int16_t *>(i2s_buff));
+    }
+    free(i2s_buff);
+    ESP_LOGI("SpeechRecognizer", "Feed task exited");
+}
+
+/**
+ * Worker loop that fetches processed audio from the AFE and runs MultiNet
+ * command detection on it.
+ *
+ * Exits immediately when the AFE fetch chunk size and the MultiNet chunk size
+ * differ; otherwise runs until tasksRunning becomes false. Detected commands
+ * are forwarded to handleRecognitionResult(); on a detection timeout the
+ * MultiNet state is reset.
+ */
+void SpeechRecognizer::detectTask() {
+    ThreadManager::printThreadInfo("Detect task started");
+    const int afe_chunksize = afe_handle->get_fetch_chunksize(afe_data);
+    const int mu_chunksize = multinet->get_samp_chunksize(model_data);
+    if (mu_chunksize != afe_chunksize) {
+        ESP_LOGE("SpeechRecognizer", "Chunk size mismatch: AFE=%d, MultiNet=%d", afe_chunksize, mu_chunksize);
+        return;
+    }
+    updateState("ready");
+    ESP_LOGI("SpeechRecognizer", "Ready for speech recognition");
+    while (tasksRunning) {
+        afe_fetch_result_t* res = afe_handle->fetch(afe_data);
+        if (!res || res->ret_value == ESP_FAIL) {
+            ESP_LOGE("SpeechRecognizer", "AFE fetch error");
+            vTaskDelay(pdMS_TO_TICKS(10));
+            continue;
+        }
+        if (!enabled) {
+            // Recognition is switched off: drop this chunk but keep draining
+            // the AFE at the rate the feed task fills it. Sleeping here would
+            // consume chunks slower than they are produced while the feed
+            // task keeps running.
+            continue;
+        }
+        // Run MultiNet on the fetched chunk
+        const esp_mn_state_t mn_state = multinet->detect(model_data, res->data);
+        if (mn_state == ESP_MN_STATE_DETECTED) {
+            // A command was recognised: report the best candidate
+            esp_mn_results_t *mn_result = multinet->get_results(model_data);
+            if (mn_result && mn_result->num > 0) {
+                SpeechRecognitionResult result;
+                result.command_id = mn_result->command_id[0];
+                result.phrase = mn_result->string;
+                result.probability = mn_result->prob[0];
+                result.phrase_id = mn_result->phrase_id[0];
+
+                handleRecognitionResult(result);
+            }
+        } else if (mn_state == ESP_MN_STATE_TIMEOUT) {
+            // Detection timed out
+            updateState("timeout");
+            esp_mn_results_t *mn_result = multinet->get_results(model_data);
+            ESP_LOGI("SpeechRecognizer", "Detection timeout: %s",
+                    mn_result && mn_result->string ? mn_result->string : "");
+            // Reset the detection state
+            multinet->clean(model_data);
+            updateState("ready");
+        }
+        // ESP_MN_STATE_DETECTING (and any other state): nothing to do yet
+    }
+    ESP_LOGI("SpeechRecognizer", "Detect task exited");
+}
+
+/**
+ * Adds one speech command and applies the updated command list.
+ *
+ * @param command_id ID to associate with the phrase.
+ * @param phrase     Phrase text handed to esp_mn_commands_add().
+ * @return false when the MultiNet handle or model data is missing, when the
+ *         command cannot be added, or when the update reports rejected
+ *         phrases; true otherwise.
+ */
+bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) {
+    // Both MultiNet pointers must be valid before touching the command list.
+    if (multinet == nullptr) {
+        ESP_LOGE("SpeechRecognizer", "MultiNet handle is null");
+        return false;
+    }
+    if (model_data == nullptr) {
+        ESP_LOGE("SpeechRecognizer", "Model data is null");
+        return false;
+    }
+
+    const esp_err_t add_status = esp_mn_commands_add(command_id, phrase.c_str());
+    if (add_status != ESP_OK) {
+        ESP_LOGE("SpeechRecognizer", "Failed to add command: %s", esp_err_to_name(add_status));
+        return false;
+    }
+
+    // Apply the command list; a non-null result describes phrases that were rejected.
+    esp_mn_error_t *rejected = esp_mn_commands_update();
+    if (rejected != nullptr) {
+        if (rejected->num < 1) {
+            ESP_LOGE("SpeechRecognizer", "无法更新的指令短语数组为空");
+        } else {
+            ESP_LOGE("SpeechRecognizer", "无法更新的指令数量: %d", rejected->num);
+            for (int idx = 0; idx < rejected->num; ++idx) {
+                ESP_LOGE("SpeechRecognizer", "无法更新的指令名称: %s，对应的id=%d", rejected->phrases[idx]->string, rejected->phrases[idx]->command_id);
+            }
+            return false;
+        }
+    }
+
+    // Log the new command, then dump the cached and the active command lists.
+    ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str());
+    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
+    esp_mn_commands_print();
+    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
+    esp_mn_active_commands_print();
+    return true;
+}
+
+/**
+ * Adds every (ID, phrase) pair via addCommand(), continuing past failures,
+ * then prints the cached and the active command lists.
+ *
+ * @return true only when every command was added successfully.
+ */
+bool SpeechRecognizer::addCommands(const std::vector<std::pair<int, std::string>>& commands) {
+    int failed = 0;
+    for (const auto& entry : commands) {
+        const bool added = addCommand(entry.first, entry.second);
+        if (!added) {
+            ++failed;
+        }
+    }
+    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
+    esp_mn_commands_print();
+    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
+    esp_mn_active_commands_print();
+    return failed == 0;
+}
+
+// Clears the speech command list; a no-op while the MultiNet handle or the
+// model data is missing.
+void SpeechRecognizer::clearCommands() {
+    if (multinet == nullptr || model_data == nullptr) {
+        return;
+    }
+    esp_mn_commands_clear();
+    ESP_LOGI("SpeechRecognizer", "All commands cleared");
+}
+
+// Installs the callback invoked for every recognised command, replacing any
+// previously registered one.
+void SpeechRecognizer::registerCommandCallback(SpeechCommandCallback callback) {
+    SpeechCommandCallback incoming(std::move(callback));
+    commandCallback = std::move(incoming);
+}
+
+// Installs the callback invoked on every state change, replacing any
+// previously registered one.
+void SpeechRecognizer::registerStateCallback(SpeechStateCallback callback) {
+    SpeechStateCallback incoming(std::move(callback));
+    stateCallback = std::move(incoming);
+}
+
+/**
+ * Records a recognition result, notifies the listeners and re-arms MultiNet.
+ *
+ * The result becomes the last result and is appended to the history, which is
+ * capped at 100 entries by dropping the oldest one.
+ */
+void SpeechRecognizer::handleRecognitionResult(const SpeechRecognitionResult& result) {
+    constexpr std::size_t kHistoryLimit = 100;
+
+    // Store the result under the history lock.
+    {
+        const std::lock_guard<std::mutex> guard(historyMutex);
+        lastResult = result;
+        history.push_back(result);
+        if (history.size() > kHistoryLimit) {
+            history.erase(history.begin());
+        }
+    }
+
+    ESP_LOGI("SpeechRecognizer", "Command detected: ID=%d, Phrase='%s', Probability=%.2f",
+            result.command_id, result.phrase.c_str(), result.probability);
+    updateState("command_detected");
+
+    // Hand the result to the registered command callback, if any.
+    if (commandCallback) {
+        commandCallback(result.command_id, result.phrase, result.probability);
+    }
+
+    // Reset the detector so the next utterance starts from a clean state.
+    const bool can_reset = multinet != nullptr && model_data != nullptr;
+    if (can_reset) {
+        multinet->clean(model_data);
+    }
+    updateState("ready");
+}
+
+// Stores the new state name under stateMutex, logs it and, outside the lock,
+// forwards it to the state callback when one is registered.
+void SpeechRecognizer::updateState(const std::string& state) {
+    {
+        const std::lock_guard<std::mutex> guard(stateMutex);
+        currentState = state;
+    }
+    ESP_LOGI("SpeechRecognizer", "State changed: %s", state.c_str());
+    if (!stateCallback) {
+        return;
+    }
+    stateCallback(state);
+}
+
+// Returns a copy of the most recent recognition result, taken under historyMutex.
+SpeechRecognitionResult SpeechRecognizer::getLastResult() const {
+    const std::lock_guard<std::mutex> guard(historyMutex);
+    SpeechRecognitionResult snapshot = lastResult;
+    return snapshot;
+}
+
+// Returns a copy of the recognition history, taken under historyMutex.
+std::vector<SpeechRecognitionResult> SpeechRecognizer::getHistory() const {
+    const std::lock_guard<std::mutex> guard(historyMutex);
+    std::vector<SpeechRecognitionResult> snapshot = history;
+    return snapshot;
+}
+
+// Switches recognition on or off, publishes the matching "enabled" /
+// "disabled" state and logs the change.
+void SpeechRecognizer::setEnabled(bool enabled) {
+    const char* const label = enabled ? "enabled" : "disabled";
+    this->enabled = enabled;
+    updateState(label);
+    ESP_LOGI("SpeechRecognizer", "Speech recognition %s", label);
+}
+
+// Returns a copy of the current state name, taken under stateMutex.
+std::string SpeechRecognizer::getCurrentState() const {
+    const std::lock_guard<std::mutex> guard(stateMutex);
+    std::string snapshot = currentState;
+    return snapshot;
+}
+
+// Reports whether start() has succeeded and stop() has not yet completed.
+bool SpeechRecognizer::isRunning() const {
+    const bool active = running;
+    return active;
+}
+
@@ -0,0 +1,171 @@
+//
+// Created by misaki on 2025/9/15.
+//
+#pragma once
+#include <functional>
+#include <vector>
+#include <string>
+#include <mutex>
+#include <atomic>
+#include "esp_afe_sr_iface.h"
+#include "model_path.h"
+#include "esp_mn_iface.h"
+#include "driver/i2s_std.h"
+#include "esp_log.h"
+#include "ThreadManager.h"
+
+// 前向声明
+struct model_iface_data_t;
+
+// 语音命令回调函数类型
+using SpeechCommandCallback = std::function<void(int command_id, const std::string& phrase, float probability)>;
+
+// 识别状态回调函数类型
+using SpeechStateCallback = std::function<void(const std::string& state)>;
+
+// Result of one speech-command recognition.
+// Every scalar member has a default so that a default-constructed result (for
+// example the one returned by getLastResult() before anything was recognised)
+// holds well-defined values instead of indeterminate ones.
+struct SpeechRecognitionResult {
+    int command_id = -1;       // ID of the recognised command; -1 = no result yet
+    std::string phrase;        // recognised phrase text
+    float probability = 0.0f;  // score reported for the recognition
+    int phrase_id = -1;        // ID of the recognised phrase; -1 = no result yet
+};
+
+// Configuration for SpeechRecognizer::init(). Every field has a default, so a
+// default-constructed instance is usable as-is.
+struct SpeechRecognizerConfig {
+    // I2S pin assignment
+    gpio_num_t bclk_pin = GPIO_NUM_15;
+    gpio_num_t ws_pin = GPIO_NUM_2;
+    gpio_num_t din_pin = GPIO_NUM_39;
+
+    // Audio front-end options
+    bool enable_aec = false;      // acoustic echo cancellation
+    bool enable_se = false;       // noise suppression
+    bool enable_vad = false;       // voice activity detection
+    vad_mode_t vad_mode = VAD_MODE_0; // VAD sensitivity
+    // Model path
+    std::string model_path = "/sdcard/srmodels";
+    // Thread settings for the feed and detect workers
+    ThreadConfig feed_thread_config = {"SR_Feed", 0, 4096, 3, false};
+    ThreadConfig detect_thread_config = {"SR_Detect", 1, 6 * 1024, 5, false};
+    // Detection timeout (ms)
+    int detection_timeout = 6000;
+};
+
+/**
+ * Singleton wrapper around I2S audio capture and ESP-SR (AFE + MultiNet)
+ * speech-command recognition.
+ *
+ * Typical use: getInstance() -> init() -> addCommand(s)() ->
+ * register*Callback() -> start(). Recognition runs on two worker threads
+ * (feed and detect); callbacks are invoked from the detect thread or from the
+ * thread that triggers a state change.
+ */
+class SpeechRecognizer {
+public:
+    // Returns the singleton instance, creating it on first use.
+    static SpeechRecognizer* getInstance();
+
+    // Non-copyable.
+    SpeechRecognizer(const SpeechRecognizer&) = delete;
+    SpeechRecognizer& operator=(const SpeechRecognizer&) = delete;
+
+    // Initialises I2S and ESP-SR; returns true on success or when already initialised.
+    bool init(const SpeechRecognizerConfig& config = SpeechRecognizerConfig());
+
+    // Stops recognition and releases the resources acquired by init().
+    void deinit();
+
+    // Adds one custom speech command.
+    bool addCommand(int command_id, const std::string& phrase);
+
+    // Adds several speech commands; returns true only if all were added.
+    bool addCommands(const std::vector<std::pair<int, std::string>>& commands);
+
+    // Removes all speech commands.
+    void clearCommands();
+
+    // Starts recognition (enables I2S and launches the worker threads).
+    bool start();
+
+    // Stops recognition and joins the worker threads.
+    void stop();
+
+    // Whether recognition is currently running.
+    bool isRunning() const;
+
+    // Registers the callback invoked for each recognised command.
+    void registerCommandCallback(SpeechCommandCallback callback);
+
+    // Registers the callback invoked on each state change.
+    void registerStateCallback(SpeechStateCallback callback);
+
+    // Returns a copy of the most recent recognition result.
+    SpeechRecognitionResult getLastResult() const;
+
+    // Returns a copy of the recognition history.
+    std::vector<SpeechRecognitionResult> getHistory() const;
+
+    // Set the VAD sensitivity (not implemented)
+    // void setVadSensitivity(vad_mode_t mode);
+
+    // Enables or disables command detection without stopping the workers.
+    void setEnabled(bool enabled);
+
+    // Returns a copy of the current state name.
+    std::string getCurrentState() const;
+
+private:
+    SpeechRecognizer();
+    ~SpeechRecognizer();
+
+    // I2S initialisation
+    bool initI2S();
+
+    // ESP-SR initialisation
+    bool initESP_SR();
+
+    // Feed task (audio capture)
+    void feedTask();
+
+    // Detect task (speech recognition)
+    void detectTask();
+
+    // Handles one recognition result
+    void handleRecognitionResult(const SpeechRecognitionResult& result);
+
+    // Updates the recognition state
+    void updateState(const std::string& state);
+
+    // Static wrappers for C-style task entry points
+    static void feedTaskWrapper(void* arg);
+    static void detectTaskWrapper(void* arg);
+
+private:
+    static SpeechRecognizer* instance;
+    static std::mutex instanceMutex;
+
+    SpeechRecognizerConfig config;
+    // These flags are written by the caller's thread (init/start/stop/
+    // setEnabled) and read by the worker threads, so they are atomic rather
+    // than plain bools.
+    std::atomic<bool> initialized;
+    std::atomic<bool> running;
+    std::atomic<bool> enabled;
+
+    // I2S
+    i2s_chan_handle_t rx_handle;
+
+    // ESP-SR
+    const esp_afe_sr_iface_t* afe_handle;
+    esp_afe_sr_data_t* afe_data;
+    srmodel_list_t* models;
+    esp_mn_iface_t* multinet;
+    model_iface_data_t* model_data;
+
+    // Callbacks
+    SpeechCommandCallback commandCallback;
+    SpeechStateCallback stateCallback;
+
+    // Recognition results (guarded by historyMutex)
+    SpeechRecognitionResult lastResult;
+    std::vector<SpeechRecognitionResult> history;
+    mutable std::mutex historyMutex;
+
+    // Worker threads and their loop flag
+    std::thread feedThread;
+    std::thread detectThread;
+    std::atomic<bool> tasksRunning;
+
+    // Current state name (guarded by stateMutex)
+    std::string currentState;
+    mutable std::mutex stateMutex;
+};