1. 完成了语音识别的C++业务层封装,测试通过

2. 对 LVGL GIF 渲染 + 音乐播放 + 语音识别的组合做了简单优化并进行了测试,
          发现 LVGL 渲染略显卡顿,语音识别有缓冲区空警告,不过无伤大雅,还需要进一步深度优化。
This commit is contained in:
Misaki
2025-09-16 01:29:17 +08:00
parent dc420c3b7a
commit 4cc761aab3
26 changed files with 134775 additions and 32 deletions
@@ -47,9 +47,9 @@ LVGLRender::LVGLRender() {
ESP_LOGI("LVGL_Render", "LVGL_Render构造函数...创建LVGL心跳...");
ThreadConfig trickConfig;
trickConfig.core_id = 1; // 渲染分配给核1
trickConfig.name = "LVGL_Render_Heartbeat";
trickConfig.priority = 5;
trickConfig.core_id = 1; // 渲染分配给核0
trickConfig.name = "LVGL_Render";
trickConfig.priority = 5; //
trickConfig.stack_size = 4096; // 给LVGL一个较大的堆栈,避免栈溢出
std::thread tick_thread = ThreadManager::createMemberThread(trickConfig, this, &LVGLRender::LVGL_Update);
@@ -0,0 +1,465 @@
//
// Created by misaki on 2025/9/15.
//
#include "SpeechRecognizer.h"
#include "esp_afe_sr_models.h"
#include "esp_mn_models.h"
#include "esp_wn_iface.h"
#include "esp_mn_speech_commands.h"
#include "model_path.h"
#include "driver/gpio.h"
#include "soc/soc_caps.h"
#include "esp_err.h"
#include "nvs_flash.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include <atomic>
#include <cstring>
#include <memory>
#include <utility>
// Definitions of the static singleton state declared in SpeechRecognizer.
// `instance` starts out null and is filled in by getInstance();
// `instanceMutex` is the lock getInstance() holds while doing so.
SpeechRecognizer* SpeechRecognizer::instance = nullptr;
std::mutex SpeechRecognizer::instanceMutex;
/// Returns the shared SpeechRecognizer, creating it on the first call.
///
/// The null check and the allocation both happen while instanceMutex is
/// held. The object is allocated with `new` and is not deleted here.
SpeechRecognizer* SpeechRecognizer::getInstance() {
    std::lock_guard<std::mutex> guard(instanceMutex);
    if (instance == nullptr) {
        instance = new SpeechRecognizer();
    }
    return instance;
}
SpeechRecognizer::SpeechRecognizer()
: initialized(false),
running(false),
enabled(true),
rx_handle(nullptr),
afe_handle(nullptr),
afe_data(nullptr),
models(nullptr),
multinet(nullptr),
model_data(nullptr),
tasksRunning(false) {
}
/// Releases everything the recognizer owns by delegating to deinit().
SpeechRecognizer::~SpeechRecognizer() {
    this->deinit();
}
bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) {
if (initialized) {
ESP_LOGI("SpeechRecognizer", "Already initialized");
return true;
}
this->config = config;
// 初始化I2S
if (!initI2S()) {
ESP_LOGE("SpeechRecognizer", "I2S initialization failed");
return false;
}
// 初始化ESP-SR
if (!initESP_SR()) {
ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed");
return false;
}
initialized = true;
ESP_LOGI("SpeechRecognizer", "Initialization completed successfully");
return true;
}
void SpeechRecognizer::deinit() {
if (!initialized) {
return;
}
stop();
// 释放ESP-SR资源
if (model_data && multinet) {
multinet->destroy(model_data);
model_data = nullptr;
}
if (afe_data && afe_handle) {
afe_handle->destroy(afe_data);
afe_data = nullptr;
}
if (models) {
// 注意:esp_srmodel_init分配的资源可能需要特殊清理
// 根据ESP-SR文档进行适当清理
}
// 释放I2S资源
if (rx_handle) {
i2s_channel_disable(rx_handle);
i2s_del_channel(rx_handle);
rx_handle = nullptr;
}
initialized = false;
ESP_LOGI("SpeechRecognizer", "Deinitialized");
}
bool SpeechRecognizer::initI2S() {
esp_err_t ret = ESP_OK;
i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret));
return false;
}
i2s_std_config_t std_cfg = {
.clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
.slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),
.gpio_cfg = {
.mclk = GPIO_NUM_NC,
.bclk = config.bclk_pin,
.ws = config.ws_pin,
.dout = GPIO_NUM_NC,
.din = config.din_pin,
.invert_flags = {
.mclk_inv = false,
.bclk_inv = false,
.ws_inv = false,
},
},
};
std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;
ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret));
return false;
}
ESP_LOGI("SpeechRecognizer", "I2S initialized successfully");
return true;
}
// Sets up the ESP-SR pipeline: selects the AFE interface, loads the model
// list from config.model_path, creates the AFE instance from a fixed
// configuration (wake word disabled; AEC / SE / VAD taken from `config`),
// then picks a MultiNet model by language and creates its instance with
// config.detection_timeout.
// Returns true when every step succeeded, false (after logging) otherwise.
// NOTE(review): the failure paths return without releasing what earlier
// steps in this function already created (models / afe_data).
bool SpeechRecognizer::initESP_SR() {
// Obtain the AFE interface table.
afe_handle = &ESP_AFE_SR_HANDLE;
// The address of an object is never null, so this check cannot fire.
if (!afe_handle) {
ESP_LOGE("SpeechRecognizer", "Failed to get AFE handle");
return false;
}
// Load the model list.
models = esp_srmodel_init(config.model_path.c_str());
if (!models) {
ESP_LOGE("SpeechRecognizer", "Failed to initialize models from path: %s", config.model_path.c_str());
return false;
}
// Configure the AFE.
afe_config_t afe_config = {
.aec_init = config.enable_aec,
.se_init = config.enable_se,
.vad_init = config.enable_vad,
.wakenet_init = false, // wake-word detection disabled
.voice_communication_init = false,
.voice_communication_agc_init = false,
.voice_communication_agc_gain = 15,
.vad_mode = config.vad_mode,
.wakenet_model_name = nullptr,
.wakenet_model_name_2 = nullptr,
.wakenet_mode = DET_MODE_2CH_90,
.afe_mode = SR_MODE_LOW_COST,
.afe_perferred_core = 0,
.afe_perferred_priority = 5,
.afe_ringbuf_size = 10,
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
.afe_linear_gain = 1.0,
.agc_mode = AFE_MN_PEAK_AGC_MODE_2,
.pcm_config = {
.total_ch_num = 2,
.mic_num = 1,
.ref_num = 1,
.sample_rate = 16000,
},
.debug_init = false,
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, nullptr}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, nullptr}},
};
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config");
return false;
}
// Load the MultiNet model; the sdkconfig macros provided by esp-sr decide
// whether the Chinese or the English model is looked up.
#if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT)
char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
#else
char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH);
#endif
if (!mn_name) {
ESP_LOGE("SpeechRecognizer", "No MultiNet model found");
return false;
}
multinet = esp_mn_handle_from_name(mn_name);
if (!multinet) {
ESP_LOGE("SpeechRecognizer", "Failed to get MultiNet handle");
return false;
}
// Second argument is the detection timeout taken from the configuration.
model_data = multinet->create(mn_name, config.detection_timeout);
if (!model_data) {
ESP_LOGE("SpeechRecognizer", "Failed to create MultiNet model data");
return false;
}
ESP_LOGI("SpeechRecognizer", "ESP-SR initialized successfully with model: %s", mn_name);
return true;
}
bool SpeechRecognizer::start() {
if (!initialized) {
ESP_LOGE("SpeechRecognizer", "Not initialized");
return false;
}
if (running) {
ESP_LOGI("SpeechRecognizer", "Already running");
return true;
}
// 启用I2S通道
esp_err_t ret = i2s_channel_enable(rx_handle);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(ret));
return false;
}
// 启动任务
tasksRunning = true;
// 使用ThreadManager创建任务
feedThread = ThreadManager::createMemberThread(
config.feed_thread_config, this, &SpeechRecognizer::feedTask);
detectThread = ThreadManager::createMemberThread(
config.detect_thread_config, this, &SpeechRecognizer::detectTask);
running = true;
updateState("started");
ESP_LOGI("SpeechRecognizer", "Speech recognition started");
return true;
}
/// Stops recognition: clears the worker loop flag, joins the feed thread and
/// then the detect thread, disables the I2S channel and reports "stopped".
/// Does nothing when recognition is not running.
///
/// NOTE(review): the detect thread only re-checks the flag after fetch()
/// returns; if fetch() can block once feeding has stopped, the second join
/// could stall — confirm against the AFE behaviour.
void SpeechRecognizer::stop() {
    if (!running) {
        return;
    }

    tasksRunning = false;

    // Wait for both workers to leave their loops.
    const auto joinIfJoinable = [](std::thread& worker) {
        if (worker.joinable()) {
            worker.join();
        }
    };
    joinIfJoinable(feedThread);
    joinIfJoinable(detectThread);

    // Capture is no longer needed once the workers are gone.
    if (rx_handle != nullptr) {
        i2s_channel_disable(rx_handle);
    }

    running = false;
    updateState("stopped");
    ESP_LOGI("SpeechRecognizer", "Speech recognition stopped");
}
void SpeechRecognizer::feedTask() {
ThreadManager::printThreadInfo("Feed task started");
int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
int nch = afe_handle->get_channel_num(afe_data);
size_t samp_len = audio_chunksize;
size_t samp_len_bytes = samp_len * sizeof(int32_t); // 单声道32位
auto *i2s_buff = static_cast<int32_t *>(malloc(samp_len_bytes));
if (!i2s_buff) {
ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer");
return;
}
size_t bytes_read;
while (tasksRunning) {
esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret));
vTaskDelay(pdMS_TO_TICKS(10));
continue;
}
// 处理音频数据(32位转16位)
for (int i = 0; i < samp_len; ++i) {
i2s_buff[i] = i2s_buff[i] >> 14; // 32:8是有效位,转换为16位音频数据
}
// 喂数据给AFE
afe_handle->feed(afe_data, reinterpret_cast<int16_t *>(i2s_buff));
}
free(i2s_buff);
ESP_LOGI("SpeechRecognizer", "Feed task exited");
}
void SpeechRecognizer::detectTask() {
ThreadManager::printThreadInfo("Detect task started");
int afe_chunksize = afe_handle->get_fetch_chunksize(afe_data);
int mu_chunksize = multinet->get_samp_chunksize(model_data);
if (mu_chunksize != afe_chunksize) {
ESP_LOGE("SpeechRecognizer", "Chunk size mismatch: AFE=%d, MultiNet=%d", afe_chunksize, mu_chunksize);
return;
}
updateState("ready");
ESP_LOGI("SpeechRecognizer", "Ready for speech recognition");
while (tasksRunning) {
afe_fetch_result_t* res = afe_handle->fetch(afe_data);
if (!res || res->ret_value == ESP_FAIL) {
ESP_LOGE("SpeechRecognizer", "AFE fetch error");
vTaskDelay(pdMS_TO_TICKS(10));
continue;
}
if (!enabled) {
vTaskDelay(pdMS_TO_TICKS(100));
continue;
}
// 使用MultiNet进行语音检测
esp_mn_state_t mn_state = multinet->detect(model_data, res->data);
if (mn_state == ESP_MN_STATE_DETECTING) {
// 检测中,不做处理
continue;
} else if (mn_state == ESP_MN_STATE_DETECTED) {
// 检测到语音命令
esp_mn_results_t *mn_result = multinet->get_results(model_data);
if (mn_result && mn_result->num > 0) {
SpeechRecognitionResult result;
result.command_id = mn_result->command_id[0];
result.phrase = mn_result->string;
result.probability = mn_result->prob[0];
result.phrase_id = mn_result->phrase_id[0];
handleRecognitionResult(result);
}
} else if (mn_state == ESP_MN_STATE_TIMEOUT) {
// 识别超时
updateState("timeout");
esp_mn_results_t *mn_result = multinet->get_results(model_data);
ESP_LOGI("SpeechRecognizer", "Detection timeout: %s",
mn_result && mn_result->string ? mn_result->string : "");
// 重置检测状态
multinet->clean(model_data);
updateState("ready");
}
}
ESP_LOGI("SpeechRecognizer", "Detect task exited");
}
bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) {
// 加强检查,确保所有相关指针都有效
if (!multinet) {
ESP_LOGE("SpeechRecognizer", "MultiNet handle is null");
return false;
}
if (!model_data) {
ESP_LOGE("SpeechRecognizer", "Model data is null");
return false;
}
esp_err_t ret = esp_mn_commands_add(command_id, phrase.c_str());
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to add command: %s", esp_err_to_name(ret));
return false;
}
// 更新命令列表
esp_mn_error_t *ret_mn = esp_mn_commands_update();
if (ret_mn) {
if (ret_mn->num >= 1) {
ESP_LOGE("SpeechRecognizer", "无法更新的指令数量: %d", ret_mn->num);
for (int i = 0; i < ret_mn->num; i++) {
ESP_LOGE("SpeechRecognizer", "无法更新的指令名称: %s,对应的id=%d", ret_mn->phrases[i]->string, ret_mn->phrases[i]->command_id);
}
return false;
}
else {
ESP_LOGE("SpeechRecognizer", "无法更新的指令短语数组为空");
}
}
// 打印缓存的指令
ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str());
ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
esp_mn_commands_print();
ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
esp_mn_active_commands_print();
return true;
}
/// Adds every (id, phrase) pair via addCommand(), continuing past failures,
/// then prints the cached and active command lists.
///
/// @param commands  Pairs of command id and phrase text.
/// @return true only when every single addCommand() call succeeded.
bool SpeechRecognizer::addCommands(const std::vector<std::pair<int, std::string>>& commands) {
    bool all_added = true;
    for (const auto& [id, text] : commands) {
        // Evaluate addCommand() first so a previous failure never skips a command.
        all_added = addCommand(id, text) && all_added;
    }

    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
    esp_mn_commands_print();
    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
    esp_mn_active_commands_print();
    return all_added;
}
/// Clears the speech command list via esp_mn_commands_clear().
/// Does nothing unless both the MultiNet interface and its model instance exist.
void SpeechRecognizer::clearCommands() {
    if (multinet == nullptr || model_data == nullptr) {
        return;
    }
    esp_mn_commands_clear();
    ESP_LOGI("SpeechRecognizer", "All commands cleared");
}
/// Installs the callback invoked for each recognised command, replacing any
/// previously registered one.
void SpeechRecognizer::registerCommandCallback(SpeechCommandCallback callback) {
    // After the swap the previous callback lives in the by-value parameter
    // and is destroyed when this function returns.
    commandCallback.swap(callback);
}
/// Installs the callback invoked on every state change, replacing any
/// previously registered one.
void SpeechRecognizer::registerStateCallback(SpeechStateCallback callback) {
    // After the swap the previous callback lives in the by-value parameter
    // and is destroyed when this function returns.
    stateCallback.swap(callback);
}
/// Records a recognised command, notifies listeners and re-arms MultiNet.
///
/// The result becomes `lastResult` and is appended to `history` (oldest
/// entry dropped once more than 100 are stored) under historyMutex. The
/// state goes to "command_detected", the command callback (if any) is
/// invoked, the detector is cleaned, and the state returns to "ready".
void SpeechRecognizer::handleRecognitionResult(const SpeechRecognitionResult& result) {
    constexpr size_t kHistoryLimit = 100;

    // Store the result; the lock covers only the bookkeeping.
    {
        std::lock_guard<std::mutex> guard(historyMutex);
        lastResult = result;
        history.push_back(result);
        if (history.size() > kHistoryLimit) {
            history.erase(history.begin());
        }
    }

    ESP_LOGI("SpeechRecognizer", "Command detected: ID=%d, Phrase='%s', Probability=%.2f",
             result.command_id, result.phrase.c_str(), result.probability);
    updateState("command_detected");

    // Notify the registered listener, if there is one.
    if (commandCallback) {
        commandCallback(result.command_id, result.phrase, result.probability);
    }

    // Reset the detector so the next utterance starts clean.
    if (multinet != nullptr && model_data != nullptr) {
        multinet->clean(model_data);
    }
    updateState("ready");
}
/// Stores the new state string under stateMutex, logs it, and forwards it to
/// the state callback when one is registered. The callback runs after the
/// lock has been released.
void SpeechRecognizer::updateState(const std::string& state) {
    {
        std::lock_guard<std::mutex> guard(stateMutex);
        currentState = state;
    }

    ESP_LOGI("SpeechRecognizer", "State changed: %s", state.c_str());

    if (!stateCallback) {
        return;
    }
    stateCallback(state);
}
/// Returns a copy of the most recently stored recognition result, taken
/// while historyMutex is held.
SpeechRecognitionResult SpeechRecognizer::getLastResult() const {
    std::lock_guard<std::mutex> guard(historyMutex);
    SpeechRecognitionResult snapshot = lastResult;
    return snapshot;
}
/// Returns a copy of the stored recognition history, taken while
/// historyMutex is held.
std::vector<SpeechRecognitionResult> SpeechRecognizer::getHistory() const {
    std::lock_guard<std::mutex> guard(historyMutex);
    std::vector<SpeechRecognitionResult> snapshot = history;
    return snapshot;
}
/// Switches command detection on or off and publishes the matching
/// "enabled" / "disabled" state.
///
/// @param enabled  true to run detection on fetched audio, false to skip it.
void SpeechRecognizer::setEnabled(bool enabled) {
    const char* const label = enabled ? "enabled" : "disabled";
    this->enabled = enabled;
    updateState(label);
    ESP_LOGI("SpeechRecognizer", "Speech recognition %s", label);
}
/// Returns a copy of the current state string, taken while stateMutex is held.
std::string SpeechRecognizer::getCurrentState() const {
    std::lock_guard<std::mutex> guard(stateMutex);
    std::string snapshot = currentState;
    return snapshot;
}
/// Reports whether start() has succeeded and stop() has not completed since.
bool SpeechRecognizer::isRunning() const {
    const bool is_running = running;
    return is_running;
}
@@ -0,0 +1,171 @@
//
// Created by misaki on 2025/9/15.
//
#pragma once
#include <functional>
#include <vector>
#include <string>
#include <mutex>
#include <atomic>
#include "esp_afe_sr_iface.h"
#include "model_path.h"
#include "esp_mn_iface.h"
#include "driver/i2s_std.h"
#include "esp_log.h"
#include "ThreadManager.h"
// Forward declaration of the model instance type; only pointers to it are used below.
struct model_iface_data_t;
// Callback invoked for a recognised command: receives the command id, the
// recognised phrase and the reported probability.
using SpeechCommandCallback = std::function<void(int command_id, const std::string& phrase, float probability)>;
// Callback invoked when the recognizer state changes: receives the new state string.
using SpeechStateCallback = std::function<void(const std::string& state)>;
// One recognition result.
// Every field has a default so that a default-constructed result (for
// example the "last result" before anything has been recognised) is in a
// well-defined "no command" state instead of holding indeterminate values.
struct SpeechRecognitionResult {
    int command_id = -1;       // id of the recognised command; -1 = none yet
    std::string phrase;        // recognised phrase text; empty = none yet
    float probability = 0.0f;  // probability reported for the command
    int phrase_id = -1;        // id of the matched phrase; -1 = none yet
};
// Recognizer configuration; every field has a default, so a
// default-constructed value can be passed to SpeechRecognizer::init().
struct SpeechRecognizerConfig {
// I2S pins
gpio_num_t bclk_pin = GPIO_NUM_15;
gpio_num_t ws_pin = GPIO_NUM_2;
gpio_num_t din_pin = GPIO_NUM_39;
// Audio front-end options
bool enable_aec = false; // acoustic echo cancellation
bool enable_se = false; // noise suppression
bool enable_vad = false; // voice activity detection
vad_mode_t vad_mode = VAD_MODE_0; // VAD sensitivity
// Model location
std::string model_path = "/sdcard/srmodels";
// Worker thread settings
// NOTE(review): positional ThreadConfig initialisers — presumably name,
// core, stack size, priority, flag; confirm against ThreadConfig.
ThreadConfig feed_thread_config = {"SR_Feed", 0, 4096, 3, false};
ThreadConfig detect_thread_config = {"SR_Detect", 1, 6 * 1024, 5, false};
// Detection timeout (ms)
int detection_timeout = 6000;
};
// Singleton wrapper around I2S audio capture and ESP-SR (AFE + MultiNet)
// speech command recognition.
//
// Typical use: getInstance() -> init() -> addCommand(s)() -> start(), with
// results delivered through the registered callbacks.
//
// `running` and `enabled` are atomic because they are written from the
// controlling thread (start()/stop()/setEnabled()) while `enabled` is read
// inside the detect worker thread; plain bools would be a data race.
class SpeechRecognizer {
public:
    // Returns the shared instance, creating it on first use.
    static SpeechRecognizer* getInstance();

    // Non-copyable.
    SpeechRecognizer(const SpeechRecognizer&) = delete;
    SpeechRecognizer& operator=(const SpeechRecognizer&) = delete;

    // Initialises I2S and ESP-SR with the given configuration.
    bool init(const SpeechRecognizerConfig& config = SpeechRecognizerConfig());
    // Stops recognition and releases the resources acquired by init().
    void deinit();

    // Adds a single custom speech command.
    bool addCommand(int command_id, const std::string& phrase);
    // Adds several speech commands; returns true only if all were added.
    bool addCommands(const std::vector<std::pair<int, std::string>>& commands);
    // Clears all speech commands.
    void clearCommands();

    // Starts recognition (spawns the feed and detect threads).
    bool start();
    // Stops recognition and joins the worker threads.
    void stop();
    // Whether recognition is currently running.
    bool isRunning() const;

    // Registers the callback invoked for every recognised command.
    void registerCommandCallback(SpeechCommandCallback callback);
    // Registers the callback invoked on every state change.
    void registerStateCallback(SpeechStateCallback callback);

    // Returns a copy of the most recent recognition result.
    SpeechRecognitionResult getLastResult() const;
    // Returns a copy of the recognition history.
    std::vector<SpeechRecognitionResult> getHistory() const;

    // Set VAD sensitivity (currently not provided).
    // void setVadSensitivity(vad_mode_t mode);

    // Enables or disables command detection.
    void setEnabled(bool enabled);
    // Returns a copy of the current state string.
    std::string getCurrentState() const;

private:
    SpeechRecognizer();
    ~SpeechRecognizer();

    // I2S setup.
    bool initI2S();
    // ESP-SR setup.
    bool initESP_SR();

    // Feed worker (audio capture).
    void feedTask();
    // Detect worker (speech recognition).
    void detectTask();

    // Stores a result, notifies listeners and re-arms the detector.
    void handleRecognitionResult(const SpeechRecognitionResult& result);
    // Stores, logs and publishes a new state string.
    void updateState(const std::string& state);

    // Static entry points for C-style task APIs.
    static void feedTaskWrapper(void* arg);
    static void detectTaskWrapper(void* arg);

private:
    static SpeechRecognizer* instance;
    static std::mutex instanceMutex;

    SpeechRecognizerConfig config;
    bool initialized;
    std::atomic<bool> running;  // set by start()/stop(), read by isRunning()
    std::atomic<bool> enabled;  // set by setEnabled(), read by the detect thread

    // I2S
    i2s_chan_handle_t rx_handle;

    // ESP-SR
    const esp_afe_sr_iface_t* afe_handle;
    esp_afe_sr_data_t* afe_data;
    srmodel_list_t* models;
    esp_mn_iface_t* multinet;
    model_iface_data_t* model_data;

    // Callbacks
    SpeechCommandCallback commandCallback;
    SpeechStateCallback stateCallback;

    // Recognition results; both are accessed under historyMutex.
    SpeechRecognitionResult lastResult;
    std::vector<SpeechRecognitionResult> history;
    mutable std::mutex historyMutex;

    // Worker threads and their shared loop flag.
    std::thread feedThread;
    std::thread detectThread;
    std::atomic<bool> tasksRunning;

    // Current state string, accessed under stateMutex.
    std::string currentState;
    mutable std::mutex stateMutex;
};