// // Created by misaki on 2025/9/15. // #include "SpeechRecognizer.h" #include "esp_afe_sr_models.h" #include "esp_mn_models.h" #include "esp_wn_iface.h" #include "esp_mn_speech_commands.h" #include "model_path.h" #include "driver/gpio.h" #include "soc/soc_caps.h" #include "esp_err.h" #include "nvs_flash.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include #include #include #include // 初始化静态成员变量 SpeechRecognizer* SpeechRecognizer::instance = nullptr; std::mutex SpeechRecognizer::instanceMutex; SpeechRecognizer* SpeechRecognizer::getInstance() { std::lock_guard lock(instanceMutex); if (!instance) { instance = new SpeechRecognizer(); } return instance; } SpeechRecognizer::SpeechRecognizer() : initialized(false), running(false), enabled(true), rx_handle(nullptr), afe_handle(nullptr), afe_data(nullptr), models(nullptr), multinet(nullptr), model_data(nullptr), tasksRunning(false) { } SpeechRecognizer::~SpeechRecognizer() { deinit(); } bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) { if (initialized) { ESP_LOGI("SpeechRecognizer", "Already initialized"); return true; } this->config = config; // 初始化I2S if (!initI2S()) { ESP_LOGE("SpeechRecognizer", "I2S initialization failed"); return false; } // 初始化ESP-SR if (!initESP_SR()) { ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed"); return false; } initialized = true; ESP_LOGI("SpeechRecognizer", "Initialization completed successfully"); return true; } void SpeechRecognizer::deinit() { if (!initialized) { return; } stop(); // 释放ESP-SR资源 if (model_data && multinet) { multinet->destroy(model_data); model_data = nullptr; } if (afe_data && afe_handle) { afe_handle->destroy(afe_data); afe_data = nullptr; } if (models) { // 注意:esp_srmodel_init分配的资源可能需要特殊清理 // 根据ESP-SR文档进行适当清理 } // 释放I2S资源 if (rx_handle) { i2s_channel_disable(rx_handle); i2s_del_channel(rx_handle); rx_handle = nullptr; } initialized = false; ESP_LOGI("SpeechRecognizer", "Deinitialized"); } bool SpeechRecognizer::initI2S() { esp_err_t ret = ESP_OK; i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER); ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle); if (ret != ESP_OK) { ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret)); return false; } i2s_std_config_t std_cfg = { .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000), .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO), .gpio_cfg = { .mclk = GPIO_NUM_NC, .bclk = config.bclk_pin, .ws = config.ws_pin, .dout = GPIO_NUM_NC, .din = config.din_pin, .invert_flags = { .mclk_inv = false, .bclk_inv = false, .ws_inv = false, }, }, }; std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT; ret = i2s_channel_init_std_mode(rx_handle, &std_cfg); if (ret != ESP_OK) { ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret)); return false; } ESP_LOGI("SpeechRecognizer", "I2S initialized successfully"); return true; } bool SpeechRecognizer::initESP_SR() { // 获取AFE句柄 afe_handle = &ESP_AFE_SR_HANDLE; if (!afe_handle) { ESP_LOGE("SpeechRecognizer", "Failed to get AFE handle"); return false; } // 初始化模型 models = esp_srmodel_init(config.model_path.c_str()); if (!models) { ESP_LOGE("SpeechRecognizer", "Failed to initialize models from path: %s", config.model_path.c_str()); return false; } // 配置AFE afe_config_t afe_config = { .aec_init = config.enable_aec, .se_init = config.enable_se, .vad_init = config.enable_vad, .wakenet_init = false, // 禁用唤醒词 .voice_communication_init = false, .voice_communication_agc_init = false, .voice_communication_agc_gain = 15, .vad_mode = config.vad_mode, .wakenet_model_name = nullptr, .wakenet_model_name_2 = nullptr, .wakenet_mode = DET_MODE_2CH_90, .afe_mode = SR_MODE_LOW_COST, .afe_perferred_core = 0, .afe_perferred_priority = 5, .afe_ringbuf_size = 10, .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, .afe_linear_gain = 1.0, .agc_mode = AFE_MN_PEAK_AGC_MODE_2, .pcm_config = { .total_ch_num = 2, .mic_num = 1, .ref_num = 1, .sample_rate = 16000, }, .debug_init = false, .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, nullptr}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, nullptr}}, }; afe_data = afe_handle->create_from_config(&afe_config); if (!afe_data) { ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config"); return false; } // 加载MultiNet模型(采用esp-sr提供的宏来处理不同语种的模型的处理问题) #if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT) char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE); #else char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH); #endif if (!mn_name) { ESP_LOGE("SpeechRecognizer", "No MultiNet model found"); return false; } multinet = esp_mn_handle_from_name(mn_name); if (!multinet) { ESP_LOGE("SpeechRecognizer", "Failed to get MultiNet handle"); return false; } model_data = multinet->create(mn_name, config.detection_timeout); if (!model_data) { ESP_LOGE("SpeechRecognizer", "Failed to create MultiNet model data"); return false; } ESP_LOGI("SpeechRecognizer", "ESP-SR initialized successfully with model: %s", mn_name); return true; } bool SpeechRecognizer::start() { if (!initialized) { ESP_LOGE("SpeechRecognizer", "Not initialized"); return false; } if (running) { ESP_LOGI("SpeechRecognizer", "Already running"); return true; } // 启用I2S通道 esp_err_t ret = i2s_channel_enable(rx_handle); if (ret != ESP_OK) { ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(ret)); return false; } // 启动任务 tasksRunning = true; // 使用ThreadManager创建任务 feedThread = ThreadManager::createMemberThread( config.feed_thread_config, this, &SpeechRecognizer::feedTask); detectThread = ThreadManager::createMemberThread( config.detect_thread_config, this, &SpeechRecognizer::detectTask); running = true; updateState("started"); ESP_LOGI("SpeechRecognizer", "Speech recognition started"); return true; } void SpeechRecognizer::stop() { if (!running) { return; } tasksRunning = false; // 等待任务结束 if (feedThread.joinable()) { feedThread.join(); } if (detectThread.joinable()) { detectThread.join(); } // 禁用I2S通道 if (rx_handle) { i2s_channel_disable(rx_handle); } running = false; updateState("stopped"); ESP_LOGI("SpeechRecognizer", "Speech recognition stopped"); } void SpeechRecognizer::feedTask() { ThreadManager::printThreadInfo("Feed task started"); int audio_chunksize = afe_handle->get_feed_chunksize(afe_data); int nch = afe_handle->get_channel_num(afe_data); size_t samp_len = audio_chunksize; size_t samp_len_bytes = samp_len * sizeof(int32_t); // 单声道32位 auto *i2s_buff = static_cast(malloc(samp_len_bytes)); if (!i2s_buff) { ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer"); return; } size_t bytes_read; while (tasksRunning) { esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY); if (ret != ESP_OK) { ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret)); vTaskDelay(pdMS_TO_TICKS(10)); continue; } // 处理音频数据(32位转16位) for (int i = 0; i < samp_len; ++i) { i2s_buff[i] = i2s_buff[i] >> 14; // 32:8是有效位,转换为16位音频数据 } // 喂数据给AFE afe_handle->feed(afe_data, reinterpret_cast(i2s_buff)); } free(i2s_buff); ESP_LOGI("SpeechRecognizer", "Feed task exited"); } void SpeechRecognizer::detectTask() { ThreadManager::printThreadInfo("Detect task started"); int afe_chunksize = afe_handle->get_fetch_chunksize(afe_data); int mu_chunksize = multinet->get_samp_chunksize(model_data); if (mu_chunksize != afe_chunksize) { ESP_LOGE("SpeechRecognizer", "Chunk size mismatch: AFE=%d, MultiNet=%d", afe_chunksize, mu_chunksize); return; } updateState("ready"); ESP_LOGI("SpeechRecognizer", "Ready for speech recognition"); while (tasksRunning) { afe_fetch_result_t* res = afe_handle->fetch(afe_data); if (!res || res->ret_value == ESP_FAIL) { ESP_LOGE("SpeechRecognizer", "AFE fetch error"); vTaskDelay(pdMS_TO_TICKS(10)); continue; } if (!enabled) { vTaskDelay(pdMS_TO_TICKS(100)); continue; } // 使用MultiNet进行语音检测 esp_mn_state_t mn_state = multinet->detect(model_data, res->data); if (mn_state == ESP_MN_STATE_DETECTING) { // 检测中,不做处理 continue; } else if (mn_state == ESP_MN_STATE_DETECTED) { // 检测到语音命令 esp_mn_results_t *mn_result = multinet->get_results(model_data); if (mn_result && mn_result->num > 0) { SpeechRecognitionResult result; result.command_id = mn_result->command_id[0]; result.phrase = mn_result->string; result.probability = mn_result->prob[0]; result.phrase_id = mn_result->phrase_id[0]; handleRecognitionResult(result); } } else if (mn_state == ESP_MN_STATE_TIMEOUT) { // 识别超时 updateState("timeout"); esp_mn_results_t *mn_result = multinet->get_results(model_data); ESP_LOGI("SpeechRecognizer", "Detection timeout: %s", mn_result && mn_result->string ? mn_result->string : ""); // 重置检测状态 multinet->clean(model_data); updateState("ready"); } } ESP_LOGI("SpeechRecognizer", "Detect task exited"); } bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) { // 加强检查,确保所有相关指针都有效 if (!multinet) { ESP_LOGE("SpeechRecognizer", "MultiNet handle is null"); return false; } if (!model_data) { ESP_LOGE("SpeechRecognizer", "Model data is null"); return false; } esp_err_t ret = esp_mn_commands_add(command_id, phrase.c_str()); if (ret != ESP_OK) { ESP_LOGE("SpeechRecognizer", "Failed to add command: %s", esp_err_to_name(ret)); return false; } // 更新命令列表 esp_mn_error_t *ret_mn = esp_mn_commands_update(); if (ret_mn) { if (ret_mn->num >= 1) { ESP_LOGE("SpeechRecognizer", "无法更新的指令数量: %d", ret_mn->num); for (int i = 0; i < ret_mn->num; i++) { ESP_LOGE("SpeechRecognizer", "无法更新的指令名称: %s,对应的id=%d", ret_mn->phrases[i]->string, ret_mn->phrases[i]->command_id); } return false; } else { ESP_LOGE("SpeechRecognizer", "无法更新的指令短语数组为空"); } } // 打印缓存的指令 ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str()); ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:"); esp_mn_commands_print(); ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:"); esp_mn_active_commands_print(); return true; } bool SpeechRecognizer::addCommands(const std::vector>& commands) { bool success = true; for (const auto& cmd : commands) { if (!addCommand(cmd.first, cmd.second)) { success = false; } } ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:"); esp_mn_commands_print(); ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:"); esp_mn_active_commands_print(); return success; } void SpeechRecognizer::clearCommands() { if (multinet && model_data) { esp_mn_commands_clear(); ESP_LOGI("SpeechRecognizer", "All commands cleared"); } } void SpeechRecognizer::registerCommandCallback(SpeechCommandCallback callback) { commandCallback = std::move(callback); } void SpeechRecognizer::registerStateCallback(SpeechStateCallback callback) { stateCallback = std::move(callback); } void SpeechRecognizer::handleRecognitionResult(const SpeechRecognitionResult& result) { // 保存到历史记录 { std::lock_guard lock(historyMutex); lastResult = result; history.push_back(result); // 限制历史记录大小 if (history.size() > 100) { history.erase(history.begin()); } } // 记录日志 ESP_LOGI("SpeechRecognizer", "Command detected: ID=%d, Phrase='%s', Probability=%.2f", result.command_id, result.phrase.c_str(), result.probability); updateState("command_detected"); // 调用回调函数 if (commandCallback) { commandCallback(result.command_id, result.phrase, result.probability); } // 重置检测状态,准备下一次识别 if (multinet && model_data) { multinet->clean(model_data); } updateState("ready"); } void SpeechRecognizer::updateState(const std::string& state) { { std::lock_guard lock(stateMutex); currentState = state; } ESP_LOGI("SpeechRecognizer", "State changed: %s", state.c_str()); if (stateCallback) { stateCallback(state); } } SpeechRecognitionResult SpeechRecognizer::getLastResult() const { std::lock_guard lock(historyMutex); return lastResult; } std::vector SpeechRecognizer::getHistory() const { std::lock_guard lock(historyMutex); return history; } void SpeechRecognizer::setEnabled(bool enabled) { this->enabled = enabled; updateState(enabled ? "enabled" : "disabled"); ESP_LOGI("SpeechRecognizer", "Speech recognition %s", enabled ? "enabled" : "disabled"); } std::string SpeechRecognizer::getCurrentState() const { std::lock_guard lock(stateMutex); return currentState; } bool SpeechRecognizer::isRunning() const { return running; }