1. 完成了语音识别的C++业务层封装,测试通过

2. 对 LVGL GIF 渲染 + 音乐播放 + 语音识别的组合做了初步测试和简单优化:
          LVGL 渲染略显卡顿,语音识别会出现缓冲区为空的警告;目前不影响使用,但仍需进一步深度优化。
This commit is contained in:
Misaki
2025-09-16 01:29:17 +08:00
parent dc420c3b7a
commit 4cc761aab3
26 changed files with 134775 additions and 32 deletions
+1
View File
@@ -4,3 +4,4 @@ log
# 组件库,配置好idf环境后,运行idf.py reconfigure就会自动生成出来 # 组件库,配置好idf环境后,运行idf.py reconfigure就会自动生成出来
managed_components managed_components
.idea .idea
.venv
+91 -1
View File
@@ -113,8 +113,98 @@ void testPetSystem() {
std::cout << SDFileManager::getInstance()->catCommand("/sdcard/pet_data/my_pet.json") << std::endl; std::cout << SDFileManager::getInstance()->catCommand("/sdcard/pet_data/my_pet.json") << std::endl;
} }
#include "SpeechRecognizer.h"
#include <nvs.h>
#include <nvs_flash.h>
// Command callback: logs every recognized command, then dispatches on its ID.
// Only IDs 0, 1 and 2 have a dedicated branch; any other ID is logged as unknown.
void commandCallback(int command_id, const std::string& phrase, float probability) {
    ESP_LOGI("Example", "Received command: ID=%d, Phrase='%s', Probability=%.2f",
             command_id, phrase.c_str(), probability);

    if (command_id == 0) {
        ESP_LOGI("Example", "执行命令0");
        // Action for command 0 goes here.
    } else if (command_id == 1) {
        ESP_LOGI("Example", "执行命令1");
        // Action for command 1 goes here.
    } else if (command_id == 2) {
        ESP_LOGI("Example", "执行命令2");
        // Action for command 2 goes here.
    } else {
        ESP_LOGI("Example", "未知的命令ID: %d", command_id);
    }
}
// State callback: logs the name of the state the recognizer just switched to.
void stateCallback(const std::string& state) {
ESP_LOGI("Example", "状态改变到: %s", state.c_str());
}
#include "SDFileManager.h"
void testMIC() {
// 初始化NVS
esp_err_t ret = nvs_flash_init();
if (ret == ESP_ERR_NVS_NO_FREE_PAGES || ret == ESP_ERR_NVS_NEW_VERSION_FOUND) {
ESP_ERROR_CHECK(nvs_flash_erase());
ret = nvs_flash_init();
}
ESP_ERROR_CHECK(ret);
// 初始化SD卡管理器
SDFileManager::getInstance()->tryInitSDCard();
// 获取SpeechRecognizer实例
SpeechRecognizer* recognizer = SpeechRecognizer::getInstance();
// 配置识别器
SpeechRecognizerConfig config;
config.enable_vad = true;
config.vad_mode = VAD_MODE_3; // 更高的VAD灵敏度
config.model_path = "/sdcard/srmodels";
// 初始化
if (!recognizer->init(config)) {
ESP_LOGE("main", "Failed to initialize speech recognizer");
return;
}
// 添加自定义命令
std::vector<std::pair<int, std::string>> commands = {
{0, "kai deng"}, // 开灯
{1, "guan deng"}, // 关灯
{2, "ti gao liang du"}, // 提高亮度
{3, "jiang di liang du"}, // 降低亮度
{4, "bo fang yin yue"}, // 播放音乐
{5, "ting zhi bo fang"} // 停止播放
};
if (!recognizer->addCommands(commands)) {
ESP_LOGE("main", "Failed to add some commands");
}
// 注册回调函数
recognizer->registerCommandCallback(commandCallback);
recognizer->registerStateCallback(stateCallback);
// 开始识别
if (!recognizer->start()) {
ESP_LOGE("main", "Failed to start speech recognition");
return;
}
ESP_LOGI("main", "Speech recognition system started successfully");
}
void Cpp_Hand() { void Cpp_Hand() {
testPetSystem(); testMIC();
// testPetSystem();
OTAClass oc; OTAClass oc;
oc.Init(); oc.Init();
+1 -1
View File
@@ -170,7 +170,7 @@ void OTAClass::Init() {
AudioOutput::getInstance()->setVolume(5); AudioOutput::getInstance()->setVolume(5);
// 同步播放 // 同步播放
AudioOutput::getInstance()->playSync("/sdcard/music", "Old_Memory.mp3"); AudioOutput::getInstance()->playSync("/sdcard/music", "kokoronashi.mp3");
// // 配置Wifi连接线程参数 // // 配置Wifi连接线程参数
-1
View File
@@ -5,7 +5,6 @@
#include "PetDao.h" #include "PetDao.h"
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
using namespace PetEnumConverter; using namespace PetEnumConverter;
// PetEnumConverter 实现 // PetEnumConverter 实现
+45 -2
View File
@@ -6,7 +6,6 @@
#include "PetBaseClass.h" #include "PetBaseClass.h"
#include "SDFileManager.h" #include "SDFileManager.h"
#include "cJSON.h" #include "cJSON.h"
#include <unordered_map>
#include <string> #include <string>
// 辅助函数:枚举类型与字符串的转换 // 辅助函数:枚举类型与字符串的转换
@@ -62,4 +61,48 @@ private:
// 宠物数据存储目录 // 宠物数据存储目录
static constexpr const char* PET_DATA_DIR = "/sdcard/pet_data"; static constexpr const char* PET_DATA_DIR = "/sdcard/pet_data";
}; };
/**
* 宠物数据结构(JSON)
{
"name": "芝士雪豹",
"hp": 85,
"density": 120,
"identity": "我是顶真,是妈妈省的",
"stage_strategy": {
"current_stage": "PET_STAGE_ADULT",
"stage_model_map": {
"PET_STAGE_YOUNG": "/models/snow_leopard_young.gif",
"PET_STAGE_ADULT": "/models/snow_leopard_adult.gif",
"PET_STAGE_OLD": "/models/snow_leopard_old.gif"
},
"stage_audio_map": {
"PET_STAGE_YOUNG": "/audio/snow_leopard_young.mp3",
"PET_STAGE_ADULT": "/audio/snow_leopard_adult.mp3",
"PET_STAGE_OLD": "/audio/snow_leopard_old.mp3"
}
},
"action_strategy": {
"current_action": "PET_ACTION_SLEEP",
"action_model_map": {
"PET_ACTION_SLEEP": "/models/actions/sleep.gif",
"PET_ACTION_EAT": "/models/actions/eat.gif",
"PET_ACTION_HAPPY": "/models/actions/happy.gif",
"PET_ACTION_ANGRY": "/models/actions/angry.gif",
"PET_ACTION_SAD": "/models/actions/sad.gif",
"PET_ACTION_EVOLVE": "/models/actions/evolve.gif",
"PET_ACTION_TOUCH": "/models/actions/touch.gif"
},
"action_audio_map": {
"PET_ACTION_SLEEP": "/audio/actions/sleep.mp3",
"PET_ACTION_EAT": "/audio/actions/eat.mp3",
"PET_ACTION_HAPPY": "/audio/actions/happy.mp3",
"PET_ACTION_ANGRY": "/audio/actions/angry.mp3",
"PET_ACTION_SAD": "/audio/actions/sad.mp3",
"PET_ACTION_EVOLVE": "/audio/actions/evolve.mp3",
"PET_ACTION_TOUCH": "/audio/actions/touch.mp3"
}
}
}
*/
@@ -47,9 +47,9 @@ LVGLRender::LVGLRender() {
ESP_LOGI("LVGL_Render", "LVGL_Render构造函数...创建LVGL心跳..."); ESP_LOGI("LVGL_Render", "LVGL_Render构造函数...创建LVGL心跳...");
ThreadConfig trickConfig; ThreadConfig trickConfig;
trickConfig.core_id = 1; // 渲染分配给核1 trickConfig.core_id = 1; // 渲染分配给核0
trickConfig.name = "LVGL_Render_Heartbeat"; trickConfig.name = "LVGL_Render";
trickConfig.priority = 5; trickConfig.priority = 5; //
trickConfig.stack_size = 4096; // 给LVGL一个较大的堆栈,避免栈溢出 trickConfig.stack_size = 4096; // 给LVGL一个较大的堆栈,避免栈溢出
std::thread tick_thread = ThreadManager::createMemberThread(trickConfig, this, &LVGLRender::LVGL_Update); std::thread tick_thread = ThreadManager::createMemberThread(trickConfig, this, &LVGLRender::LVGL_Update);
@@ -0,0 +1,465 @@
//
// Created by misaki on 2025/9/15.
//
#include "SpeechRecognizer.h"
#include "esp_afe_sr_models.h"
#include "esp_mn_models.h"
#include "esp_wn_iface.h"
#include "esp_mn_speech_commands.h"
#include "model_path.h"
#include "driver/gpio.h"
#include "soc/soc_caps.h"
#include "esp_err.h"
#include "nvs_flash.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include <atomic>
#include <cstring>
#include <memory>
#include <utility>
// 初始化静态成员变量
SpeechRecognizer* SpeechRecognizer::instance = nullptr;
std::mutex SpeechRecognizer::instanceMutex;
// Returns the process-wide recognizer, creating it on first use.
// instanceMutex is held for the whole check-and-create sequence.
SpeechRecognizer* SpeechRecognizer::getInstance() {
    const std::lock_guard<std::mutex> guard(instanceMutex);
    if (instance == nullptr) {
        instance = new SpeechRecognizer();
    }
    return instance;
}
// Constructs the recognizer in an idle state: not initialized, not running,
// recognition enabled, and every driver/model handle null until init() runs.
SpeechRecognizer::SpeechRecognizer()
: initialized(false),
running(false),
enabled(true),
rx_handle(nullptr),
afe_handle(nullptr),
afe_data(nullptr),
models(nullptr),
multinet(nullptr),
model_data(nullptr),
tasksRunning(false) {
}
// Destructor: delegates all teardown to deinit(), which is a no-op when the
// recognizer was never initialized.
SpeechRecognizer::~SpeechRecognizer() {
deinit();
}
bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) {
if (initialized) {
ESP_LOGI("SpeechRecognizer", "Already initialized");
return true;
}
this->config = config;
// 初始化I2S
if (!initI2S()) {
ESP_LOGE("SpeechRecognizer", "I2S initialization failed");
return false;
}
// 初始化ESP-SR
if (!initESP_SR()) {
ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed");
return false;
}
initialized = true;
ESP_LOGI("SpeechRecognizer", "Initialization completed successfully");
return true;
}
void SpeechRecognizer::deinit() {
if (!initialized) {
return;
}
stop();
// 释放ESP-SR资源
if (model_data && multinet) {
multinet->destroy(model_data);
model_data = nullptr;
}
if (afe_data && afe_handle) {
afe_handle->destroy(afe_data);
afe_data = nullptr;
}
if (models) {
// 注意:esp_srmodel_init分配的资源可能需要特殊清理
// 根据ESP-SR文档进行适当清理
}
// 释放I2S资源
if (rx_handle) {
i2s_channel_disable(rx_handle);
i2s_del_channel(rx_handle);
rx_handle = nullptr;
}
initialized = false;
ESP_LOGI("SpeechRecognizer", "Deinitialized");
}
bool SpeechRecognizer::initI2S() {
esp_err_t ret = ESP_OK;
i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret));
return false;
}
i2s_std_config_t std_cfg = {
.clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
.slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),
.gpio_cfg = {
.mclk = GPIO_NUM_NC,
.bclk = config.bclk_pin,
.ws = config.ws_pin,
.dout = GPIO_NUM_NC,
.din = config.din_pin,
.invert_flags = {
.mclk_inv = false,
.bclk_inv = false,
.ws_inv = false,
},
},
};
std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;
ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret));
return false;
}
ESP_LOGI("SpeechRecognizer", "I2S initialized successfully");
return true;
}
// Sets up the ESP-SR pipeline: takes the AFE interface, loads the model list
// from config.model_path, creates the AFE instance from the settings below,
// then selects and instantiates a MultiNet model with config.detection_timeout.
// Returns true on success; on any failure logs the reason and returns false.
// Note: the early `return false` paths do not release objects created earlier
// in this function (models, afe_data).
bool SpeechRecognizer::initESP_SR() {
// Obtain the AFE interface handle
afe_handle = &ESP_AFE_SR_HANDLE;
if (!afe_handle) {
ESP_LOGE("SpeechRecognizer", "Failed to get AFE handle");
return false;
}
// Load the model list from the configured path
models = esp_srmodel_init(config.model_path.c_str());
if (!models) {
ESP_LOGE("SpeechRecognizer", "Failed to initialize models from path: %s", config.model_path.c_str());
return false;
}
// Configure the AFE: AEC / SE / VAD follow `config`, wake word is off
afe_config_t afe_config = {
.aec_init = config.enable_aec,
.se_init = config.enable_se,
.vad_init = config.enable_vad,
.wakenet_init = false, // wake-word detection disabled
.voice_communication_init = false,
.voice_communication_agc_init = false,
.voice_communication_agc_gain = 15,
.vad_mode = config.vad_mode,
.wakenet_model_name = nullptr,
.wakenet_model_name_2 = nullptr,
.wakenet_mode = DET_MODE_2CH_90,
.afe_mode = SR_MODE_LOW_COST,
.afe_perferred_core = 0,
.afe_perferred_priority = 5,
.afe_ringbuf_size = 10,
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
.afe_linear_gain = 1.0,
.agc_mode = AFE_MN_PEAK_AGC_MODE_2,
.pcm_config = {
.total_ch_num = 2,
.mic_num = 1,
.ref_num = 1,
.sample_rate = 16000,
},
.debug_init = false,
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, nullptr}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, nullptr}},
};
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config");
return false;
}
// Pick the MultiNet model: the sdkconfig macros below choose the Chinese model when a Chinese MultiNet is enabled, otherwise the English one
#if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT)
char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
#else
char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH);
#endif
if (!mn_name) {
ESP_LOGE("SpeechRecognizer", "No MultiNet model found");
return false;
}
multinet = esp_mn_handle_from_name(mn_name);
if (!multinet) {
ESP_LOGE("SpeechRecognizer", "Failed to get MultiNet handle");
return false;
}
model_data = multinet->create(mn_name, config.detection_timeout);
if (!model_data) {
ESP_LOGE("SpeechRecognizer", "Failed to create MultiNet model data");
return false;
}
ESP_LOGI("SpeechRecognizer", "ESP-SR initialized successfully with model: %s", mn_name);
return true;
}
// Starts recognition: enables the I2S channel, raises the tasksRunning flag and
// launches the feed and detect worker threads via ThreadManager.
// Returns false when the recognizer is not initialized or the I2S channel
// cannot be enabled; returns true when started or already running.
bool SpeechRecognizer::start() {
if (!initialized) {
ESP_LOGE("SpeechRecognizer", "Not initialized");
return false;
}
if (running) {
ESP_LOGI("SpeechRecognizer", "Already running");
return true;
}
// Enable the I2S channel
esp_err_t ret = i2s_channel_enable(rx_handle);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(ret));
return false;
}
// Raise the run flag before the threads start so their loops see it set
tasksRunning = true;
// Create the worker threads through ThreadManager
feedThread = ThreadManager::createMemberThread(
config.feed_thread_config, this, &SpeechRecognizer::feedTask);
detectThread = ThreadManager::createMemberThread(
config.detect_thread_config, this, &SpeechRecognizer::detectTask);
running = true;
updateState("started");
ESP_LOGI("SpeechRecognizer", "Speech recognition started");
return true;
}
// Stops recognition: clears the tasksRunning flag, joins both worker threads,
// disables the I2S channel and publishes the "stopped" state.
// Does nothing when recognition is not running.
void SpeechRecognizer::stop() {
if (!running) {
return;
}
// Ask both worker loops to exit
tasksRunning = false;
// Wait for the workers to finish
// NOTE(review): the feed thread is joined first; if the detect thread is blocked
// in fetch() once feeding has stopped, the second join may wait — confirm.
if (feedThread.joinable()) {
feedThread.join();
}
if (detectThread.joinable()) {
detectThread.join();
}
// Disable the I2S channel
if (rx_handle) {
i2s_channel_disable(rx_handle);
}
running = false;
updateState("stopped");
ESP_LOGI("SpeechRecognizer", "Speech recognition stopped");
}
void SpeechRecognizer::feedTask() {
ThreadManager::printThreadInfo("Feed task started");
int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
int nch = afe_handle->get_channel_num(afe_data);
size_t samp_len = audio_chunksize;
size_t samp_len_bytes = samp_len * sizeof(int32_t); // 单声道32位
auto *i2s_buff = static_cast<int32_t *>(malloc(samp_len_bytes));
if (!i2s_buff) {
ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer");
return;
}
size_t bytes_read;
while (tasksRunning) {
esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret));
vTaskDelay(pdMS_TO_TICKS(10));
continue;
}
// 处理音频数据(32位转16位)
for (int i = 0; i < samp_len; ++i) {
i2s_buff[i] = i2s_buff[i] >> 14; // 32:8是有效位,转换为16位音频数据
}
// 喂数据给AFE
afe_handle->feed(afe_data, reinterpret_cast<int16_t *>(i2s_buff));
}
free(i2s_buff);
ESP_LOGI("SpeechRecognizer", "Feed task exited");
}
void SpeechRecognizer::detectTask() {
ThreadManager::printThreadInfo("Detect task started");
int afe_chunksize = afe_handle->get_fetch_chunksize(afe_data);
int mu_chunksize = multinet->get_samp_chunksize(model_data);
if (mu_chunksize != afe_chunksize) {
ESP_LOGE("SpeechRecognizer", "Chunk size mismatch: AFE=%d, MultiNet=%d", afe_chunksize, mu_chunksize);
return;
}
updateState("ready");
ESP_LOGI("SpeechRecognizer", "Ready for speech recognition");
while (tasksRunning) {
afe_fetch_result_t* res = afe_handle->fetch(afe_data);
if (!res || res->ret_value == ESP_FAIL) {
ESP_LOGE("SpeechRecognizer", "AFE fetch error");
vTaskDelay(pdMS_TO_TICKS(10));
continue;
}
if (!enabled) {
vTaskDelay(pdMS_TO_TICKS(100));
continue;
}
// 使用MultiNet进行语音检测
esp_mn_state_t mn_state = multinet->detect(model_data, res->data);
if (mn_state == ESP_MN_STATE_DETECTING) {
// 检测中,不做处理
continue;
} else if (mn_state == ESP_MN_STATE_DETECTED) {
// 检测到语音命令
esp_mn_results_t *mn_result = multinet->get_results(model_data);
if (mn_result && mn_result->num > 0) {
SpeechRecognitionResult result;
result.command_id = mn_result->command_id[0];
result.phrase = mn_result->string;
result.probability = mn_result->prob[0];
result.phrase_id = mn_result->phrase_id[0];
handleRecognitionResult(result);
}
} else if (mn_state == ESP_MN_STATE_TIMEOUT) {
// 识别超时
updateState("timeout");
esp_mn_results_t *mn_result = multinet->get_results(model_data);
ESP_LOGI("SpeechRecognizer", "Detection timeout: %s",
mn_result && mn_result->string ? mn_result->string : "");
// 重置检测状态
multinet->clean(model_data);
updateState("ready");
}
}
ESP_LOGI("SpeechRecognizer", "Detect task exited");
}
bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) {
// 加强检查,确保所有相关指针都有效
if (!multinet) {
ESP_LOGE("SpeechRecognizer", "MultiNet handle is null");
return false;
}
if (!model_data) {
ESP_LOGE("SpeechRecognizer", "Model data is null");
return false;
}
esp_err_t ret = esp_mn_commands_add(command_id, phrase.c_str());
if (ret != ESP_OK) {
ESP_LOGE("SpeechRecognizer", "Failed to add command: %s", esp_err_to_name(ret));
return false;
}
// 更新命令列表
esp_mn_error_t *ret_mn = esp_mn_commands_update();
if (ret_mn) {
if (ret_mn->num >= 1) {
ESP_LOGE("SpeechRecognizer", "无法更新的指令数量: %d", ret_mn->num);
for (int i = 0; i < ret_mn->num; i++) {
ESP_LOGE("SpeechRecognizer", "无法更新的指令名称: %s,对应的id=%d", ret_mn->phrases[i]->string, ret_mn->phrases[i]->command_id);
}
return false;
}
else {
ESP_LOGE("SpeechRecognizer", "无法更新的指令短语数组为空");
}
}
// 打印缓存的指令
ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str());
ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
esp_mn_commands_print();
ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
esp_mn_active_commands_print();
return true;
}
// Registers every (ID, phrase) pair through addCommand(). All entries are
// attempted even after a failure; afterwards the cached and active command
// lists are printed.
//
// @return true only when every single addCommand() call succeeded.
bool SpeechRecognizer::addCommands(const std::vector<std::pair<int, std::string>>& commands) {
    bool all_added = true;
    for (const auto& entry : commands) {
        const bool added = addCommand(entry.first, entry.second);
        all_added = all_added && added;
    }

    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
    esp_mn_commands_print();
    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
    esp_mn_active_commands_print();
    return all_added;
}
// Removes all registered speech commands. Silently does nothing when MultiNet
// has not been set up.
void SpeechRecognizer::clearCommands() {
    if (multinet == nullptr || model_data == nullptr) {
        return;
    }
    esp_mn_commands_clear();
    ESP_LOGI("SpeechRecognizer", "All commands cleared");
}
// Stores the callback invoked for every recognized command, replacing any
// previously registered one. The member is assigned without locking.
void SpeechRecognizer::registerCommandCallback(SpeechCommandCallback callback) {
commandCallback = std::move(callback);
}
// Stores the callback invoked on every state change, replacing any previously
// registered one. The member is assigned without locking.
void SpeechRecognizer::registerStateCallback(SpeechStateCallback callback) {
stateCallback = std::move(callback);
}
// Processes one recognized command: records it (last result + bounded history),
// logs it, publishes "command_detected", invokes the command callback, resets
// MultiNet and finally publishes "ready" again.
void SpeechRecognizer::handleRecognitionResult(const SpeechRecognitionResult& result) {
    // Record the result; the history keeps at most 100 entries, oldest dropped first.
    {
        const std::lock_guard<std::mutex> guard(historyMutex);
        lastResult = result;
        history.push_back(result);
        const bool over_limit = history.size() > 100;
        if (over_limit) {
            history.erase(history.begin());
        }
    }

    ESP_LOGI("SpeechRecognizer", "Command detected: ID=%d, Phrase='%s', Probability=%.2f",
             result.command_id, result.phrase.c_str(), result.probability);

    updateState("command_detected");

    // Notify the registered listener, if any.
    if (commandCallback) {
        commandCallback(result.command_id, result.phrase, result.probability);
    }

    // Reset the detector so the next utterance is recognized from scratch.
    const bool can_reset = (multinet != nullptr) && (model_data != nullptr);
    if (can_reset) {
        multinet->clean(model_data);
    }

    updateState("ready");
}
// Records the new state under stateMutex, logs it, and then (outside the lock)
// notifies the state callback when one is registered.
void SpeechRecognizer::updateState(const std::string& state) {
    {
        const std::lock_guard<std::mutex> guard(stateMutex);
        currentState = state;
    }

    ESP_LOGI("SpeechRecognizer", "State changed: %s", state.c_str());

    if (!stateCallback) {
        return;
    }
    stateCallback(state);
}
// Returns a copy of the most recent recognition result, taken under historyMutex.
SpeechRecognitionResult SpeechRecognizer::getLastResult() const {
std::lock_guard<std::mutex> lock(historyMutex);
return lastResult;
}
// Returns a copy of the recorded recognition history, taken under historyMutex.
std::vector<SpeechRecognitionResult> SpeechRecognizer::getHistory() const {
std::lock_guard<std::mutex> lock(historyMutex);
return history;
}
// Switches command detection on or off, publishes the matching state
// ("enabled" / "disabled") and logs the change.
void SpeechRecognizer::setEnabled(bool enabled) {
    this->enabled = enabled;

    const char* label = enabled ? "enabled" : "disabled";
    updateState(label);
    ESP_LOGI("SpeechRecognizer", "Speech recognition %s", label);
}
// Returns a copy of the current state string, taken under stateMutex.
std::string SpeechRecognizer::getCurrentState() const {
std::lock_guard<std::mutex> lock(stateMutex);
return currentState;
}
// Reports whether start() has completed and stop() has not been called since.
bool SpeechRecognizer::isRunning() const {
return running;
}
@@ -0,0 +1,171 @@
//
// Created by misaki on 2025/9/15.
//
#pragma once
#include <functional>
#include <vector>
#include <string>
#include <mutex>
#include <atomic>
#include "esp_afe_sr_iface.h"
#include "model_path.h"
#include "esp_mn_iface.h"
#include "driver/i2s_std.h"
#include "esp_log.h"
#include "ThreadManager.h"
// 前向声明
struct model_iface_data_t;
// 语音命令回调函数类型
using SpeechCommandCallback = std::function<void(int command_id, const std::string& phrase, float probability)>;
// 识别状态回调函数类型
using SpeechStateCallback = std::function<void(const std::string& state)>;
// One recognition result as reported by the detect task.
// The numeric members carry default values so that a default-constructed
// result (for example the recognizer's "last result" before anything has been
// recognized) never exposes indeterminate values; -1 marks "no command".
struct SpeechRecognitionResult {
    int command_id = -1;       // ID of the recognized command, -1 when none
    std::string phrase;        // recognized phrase text
    float probability = 0.0f;  // score reported for the top result
    int phrase_id = -1;        // ID of the matched phrase, -1 when none
};
// Configuration consumed by SpeechRecognizer::init(); every member has a default.
struct SpeechRecognizerConfig {
// I2S microphone pins
gpio_num_t bclk_pin = GPIO_NUM_15;
gpio_num_t ws_pin = GPIO_NUM_2;
gpio_num_t din_pin = GPIO_NUM_39;
// Audio front-end options
bool enable_aec = false; // acoustic echo cancellation
bool enable_se = false; // noise suppression
bool enable_vad = false; // voice activity detection
vad_mode_t vad_mode = VAD_MODE_0; // VAD mode (original note: VAD sensitivity)
// Path the speech models are loaded from
std::string model_path = "/sdcard/srmodels";
// Worker thread settings; the meaning of each positional value is defined by
// ThreadConfig (ThreadManager.h), which is not visible here
ThreadConfig feed_thread_config = {"SR_Feed", 0, 4096, 3, false};
ThreadConfig detect_thread_config = {"SR_Detect", 1, 6 * 1024, 5, false};
// Detection timeout in milliseconds
int detection_timeout = 6000;
};
// Singleton wrapper around the ESP-SR command-recognition pipeline.
// An I2S microphone channel feeds the AFE from one worker thread while a second
// worker thread fetches the processed audio, runs MultiNet on it and reports
// recognized commands and state changes through the registered callbacks.
class SpeechRecognizer {
public:
    // Returns the single shared instance, creating it on first use.
    static SpeechRecognizer* getInstance();

    // Non-copyable.
    SpeechRecognizer(const SpeechRecognizer&) = delete;
    SpeechRecognizer& operator=(const SpeechRecognizer&) = delete;

    // Initializes I2S and ESP-SR with the given configuration.
    bool init(const SpeechRecognizerConfig& config = SpeechRecognizerConfig());

    // Stops recognition and releases everything init() created.
    void deinit();

    // Adds one custom speech command.
    bool addCommand(int command_id, const std::string& phrase);

    // Adds several speech commands; returns true only when all were accepted.
    bool addCommands(const std::vector<std::pair<int, std::string>>& commands);

    // Removes all speech commands.
    void clearCommands();

    // Starts the worker threads.
    bool start();

    // Stops the worker threads.
    void stop();

    // Whether recognition is currently running.
    bool isRunning() const;

    // Registers the callback invoked for every recognized command.
    void registerCommandCallback(SpeechCommandCallback callback);

    // Registers the callback invoked on every state change.
    void registerStateCallback(SpeechStateCallback callback);

    // Returns the most recent recognition result.
    SpeechRecognitionResult getLastResult() const;

    // Returns the recorded recognition history.
    std::vector<SpeechRecognitionResult> getHistory() const;

    // Set the VAD sensitivity (not implemented)
    // void setVadSensitivity(vad_mode_t mode);

    // Enables or disables command detection without stopping the threads.
    void setEnabled(bool enabled);

    // Returns the current state string.
    std::string getCurrentState() const;

private:
    SpeechRecognizer();
    ~SpeechRecognizer();

    // I2S setup.
    bool initI2S();

    // ESP-SR setup (models, AFE, MultiNet).
    bool initESP_SR();

    // Feed worker: audio capture.
    void feedTask();

    // Detect worker: command recognition.
    void detectTask();

    // Records, logs and dispatches one recognition result.
    void handleRecognitionResult(const SpeechRecognitionResult& result);

    // Records, logs and publishes a state change.
    void updateState(const std::string& state);

    // Static wrappers declared for C-interface compatibility.
    // NOTE(review): no definitions are visible in SpeechRecognizer.cpp — confirm
    // whether these are still needed.
    static void feedTaskWrapper(void* arg);
    static void detectTaskWrapper(void* arg);

private:
    static SpeechRecognizer* instance;
    static std::mutex instanceMutex;

    SpeechRecognizerConfig config;
    bool initialized;
    // `running` and `enabled` are read and written from different threads
    // (control thread vs. detect worker / isRunning() callers), so they are
    // atomic to avoid a data race.
    std::atomic<bool> running;
    std::atomic<bool> enabled;

    // I2S
    i2s_chan_handle_t rx_handle;

    // ESP-SR
    const esp_afe_sr_iface_t* afe_handle;
    esp_afe_sr_data_t* afe_data;
    srmodel_list_t* models;
    esp_mn_iface_t* multinet;
    model_iface_data_t* model_data;

    // Callbacks
    SpeechCommandCallback commandCallback;
    SpeechStateCallback stateCallback;

    // Recognition results
    SpeechRecognitionResult lastResult;
    std::vector<SpeechRecognitionResult> history;
    mutable std::mutex historyMutex;

    // Worker threads and their shared run flag
    std::thread feedThread;
    std::thread detectThread;
    std::atomic<bool> tasksRunning;

    // Current state
    std::string currentState;
    mutable std::mutex stateMutex;
};
Binary file not shown.
@@ -0,0 +1,146 @@
# pip3 install g2p_en
from g2p_en import G2p
import argparse
# python3 gen_sr_commands.py "Turn on the light,Switch on the light;Turn off the light,Switch off the light,Go dark;\
# Start fan;Stop fan;Volume down,Turn down;Mute sound;Next song;Pause playback"
# enum {
# SR_CMD_TURN_ON_THE_LIGHT,
# SR_CMD_TURN_OFF_THE_LIGHT,
# SR_CMD_START_FAN,
# SR_CMD_STOP_FAN,
# SR_CMD_VOLUME_DOWN,
# SR_CMD_MUTE_SOUND,
# SR_CMD_NEXT_SONG,
# SR_CMD_PAUSE_PLAYBACK,
# };
# static const sr_cmd_t sr_commands[] = {
# { 0, "Turn on the light", "TkN nN jc LiT"},
# { 0, "Switch on the light", "SWgp nN jc LiT"},
# { 1, "Turn off the light", "TkN eF jc LiT"},
# { 1, "Switch off the light", "SWgp eF jc LiT"},
# { 1, "Go dark", "Gb DnRK"},
# { 2, "Start fan", "STnRT FaN"},
# { 3, "Stop fan", "STnP FaN"},
# { 4, "Volume down", "VnLYoM DtN"},
# { 4, "Turn down", "TkN DtN"},
# { 5, "Mute sound", "MYoT StND"},
# { 6, "Next song", "NfKST Sel"},
# { 7, "Pause playback", "PeZ PLdBaK"},
# };
def english_g2p(text):
    """Convert English command phrases into C source for an ``sr_cmd_t`` table.

    ``text`` holds commands separated by ``;``; each command may list several
    alternative phrases separated by ``,``. Every phrase is converted to
    phonemes with ``G2p`` and each phoneme is mapped to a one-character code
    through the ``alphabet`` table below. Phonemes missing from the table are
    skipped and reported on stdout.

    Prints a C ``enum`` (one entry per command, named after its first phrase)
    followed by the ``sr_commands`` array, and returns the array text.
    """
    g2p = G2p()
    out = "static const sr_cmd_t sr_commands[] = {\n"
    enum = "enum {\n"
    # Phoneme label (as produced by G2p) -> single-character code.
    alphabet = {
        "AE1": "a",
        "N": "N",
        " ": " ",
        "OW1": "b",
        "V": "V",
        "AH0": "c",
        "L": "L",
        "F": "F",
        "EY1": "d",
        "S": "S",
        "B": "B",
        "R": "R",
        "AO1": "e",
        "D": "D",
        "AH1": "c",
        "EH1": "f",
        "OW0": "b",
        "IH0": "g",
        "G": "G",
        "HH": "h",
        "K": "K",
        "IH1": "g",
        "W": "W",
        "AY1": "i",
        "T": "T",
        "M": "M",
        "Z": "Z",
        "DH": "j",
        "ER0": "k",
        "P": "P",
        "NG": "l",
        "IY1": "m",
        "AA1": "n",
        "Y": "Y",
        "UW1": "o",
        "IY0": "m",
        "EH2": "f",
        "CH": "p",
        "AE0": "a",
        "JH": "q",
        "ZH": "r",
        "AA2": "n",
        "SH": "s",
        "AW1": "t",
        "OY1": "u",
        "AW2": "t",
        "IH2": "g",
        "AE2": "a",
        "EY2": "d",
        "ER1": "k",
        "TH": "v",
        "UH1": "w",
        "UW2": "o",
        "OW2": "b",
        "AY2": "i",
        "UW0": "o",
        "AH2": "c",
        "EH0": "f",
        "AW0": "t",
        "AO2": "e",
        "AO0": "e",
        "UH0": "w",
        "UH2": "w",
        "AA0": "n",
        "AY0": "i",
        "IY2": "m",
        "EY0": "d",
        "ER2": "k",
        "OY2": "u",
        "OY0": "u",
    }

    cmd_id = 0
    text_list = text.split(";")
    for item in text_list:
        item = item.split(",")
        # phrase_id counts the alternatives of the current command; only the
        # first alternative contributes an enum entry.
        phrase_id = 0
        for phrase in item:
            labels = g2p(phrase)
            phoneme = ""
            for char in labels:
                if char not in alphabet:
                    # The message previously printed a literal "%s" because the
                    # format argument was never supplied.
                    print("skip %s, not found in alphabet" % char)
                    continue
                else:
                    phoneme += alphabet[char]
            out += "    { " + str(cmd_id) + ', "' + phrase + '", "' + phoneme + '"},\n'
            if phrase_id == 0:
                enum += "    SR_CMD_" + phrase.upper().replace(" ", "_") + ",\n"
            phrase_id += 1
        cmd_id += 1
    out += "};"
    enum += "};"
    # print(text)
    print(enum)
    print(out)

    return out
if __name__ == "__main__":
    # Command-line entry point: takes the command text as its single positional
    # argument and prints the generated C tables.
    cli = argparse.ArgumentParser(prog="English Speech Commands G2P")
    cli.add_argument("text", type=str, default=None, help="input text")
    parsed = cli.parse_args()
    if parsed.text is not None:
        english_g2p(parsed.text)
@@ -0,0 +1,2 @@
# One-off setup script: downloads the NLTK 'averaged_perceptron_tagger' resource.
# NOTE(review): presumably a prerequisite of the g2p_en package used by the
# command generator script — confirm.
import nltk
nltk.download('averaged_perceptron_tagger')
Binary file not shown.
@@ -0,0 +1,76 @@
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.7a]
ftp://ftp.cs.cmu.edu/project/speech/dict/
https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict/cmudict.0.7a
Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved.
File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L
The dictionary contains 127069 entries. Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations. Many of these are fast-speech variants.
Phonemes: There are 39 phonemes, as shown below:
Phoneme Example Translation Phoneme Example Translation
------- ------- ----------- ------- ------- -----------
AA odd AA D AE at AE T
AH hut HH AH T AO ought AO T
AW cow K AW AY hide HH AY D
B be B IY CH cheese CH IY Z
D dee D IY DH thee DH IY
EH Ed EH D ER hurt HH ER T
EY ate EY T F fee F IY
G green G R IY N HH he HH IY
IH it IH T IY eat IY T
JH gee JH IY K key K IY
L lee L IY M me M IY
N knee N IY NG ping P IH NG
OW oat OW T OY toy T OY
P pee P IY R read R IY D
S sea S IY SH she SH IY
T tea T IY TH theta TH EY T AH
UH hood HH UH D UW two T UW
V vee V IY W we W IY
Y yield Y IY L D Z zee Z IY
ZH seizure S IY ZH ER
(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
The contents of this file are deemed to be source code.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
This work was supported in part by funding from the Defense Advanced
Research Projects Agency, the Office of Naval Research and the National
Science Foundation of the United States of America, and by member
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge
the contributions of many volunteers to the expansion and improvement of
this dictionary.
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File diff suppressed because it is too large Load Diff
@@ -0,0 +1 @@
[".", "(", ")", ":", "''", "EX", "JJS", "WRB", "VBG", "VBP", "NN", "SYM", "VB", "UH", "NNPS", "NNP", "``", "$", "NNS", "JJR", "MD", "RP", "VBD", "DT", "POS", "RBR", ",", "VBZ", "PDT", "VBN", "WP$", "WDT", "WP", "PRP$", "CD", "IN", "#", "CC", "RB", "FW", "RBS", "PRP", "LS", "JJ", "TO"]
+1 -1
View File
@@ -91,7 +91,7 @@ void Audio_Init(void)
.mute_fn = audio_mute_function, .mute_fn = audio_mute_function,
.write_fn = bsp_i2s_write, .write_fn = bsp_i2s_write,
.clk_set_fn = bsp_i2s_reconfig_clk, .clk_set_fn = bsp_i2s_reconfig_clk,
.priority = 5, .priority = 3,
.coreID = 0 // 运行在0号核,避免与lvgl抢占资源 .coreID = 0 // 运行在0号核,避免与lvgl抢占资源
}; };
ret = audio_player_new(config); ret = audio_player_new(config);
+1 -1
View File
@@ -344,7 +344,7 @@ void MIC_Speech_init()
afe_config.pcm_config.mic_num = 1; afe_config.pcm_config.mic_num = 1;
afe_config.pcm_config.ref_num = 1; afe_config.pcm_config.ref_num = 1;
afe_config.pcm_config.sample_rate = 16000; afe_config.pcm_config.sample_rate = 16000;
afe_config.wakenet_model_name = esp_srmodel_filter(MIC_Speech.models, ESP_WN_PREFIX, NULL); afe_config.wakenet_model_name = esp_srmodel_filter(MIC_Speech.models, ESP_WN_PREFIX, NULL); // 获取唤醒词模型
MIC_Speech.afe_data = MIC_Speech.afe_handle->create_from_config(&afe_config); MIC_Speech.afe_data = MIC_Speech.afe_handle->create_from_config(&afe_config);
// 注意两个任务被分配了不同的核心与优先级,这是为了防止AFE(Audio Front-End)内部环形缓冲区溢出 // 注意两个任务被分配了不同的核心与优先级,这是为了防止AFE(Audio Front-End)内部环形缓冲区溢出
+2
View File
@@ -38,6 +38,7 @@ idf_component_register(SRCS "Bionic_sphere.c"
"../Bionic_Core/ToolsClass/AudioOutput/AudioOutput.cpp" # 音频输出类库 "../Bionic_Core/ToolsClass/AudioOutput/AudioOutput.cpp" # 音频输出类库
"../Bionic_Core/ToolsClass/LVGL_Render/LVGLRender.cpp" # LVGL渲染类库 "../Bionic_Core/ToolsClass/LVGL_Render/LVGLRender.cpp" # LVGL渲染类库
"../Bionic_Core/ToolsClass/SDFileManager/SDFileManager.cpp" # SD文件管理类库 "../Bionic_Core/ToolsClass/SDFileManager/SDFileManager.cpp" # SD文件管理类库
"../Bionic_Core/ToolsClass/SpeechRecognizer/SpeechRecognizer.cpp" # 语音识别类库
"../Bionic_Core/ToolsClass/WifiConnectors/WifiConnectors.cpp" # WIFI连接类库 "../Bionic_Core/ToolsClass/WifiConnectors/WifiConnectors.cpp" # WIFI连接类库
"../Bionic_Core/ToolsClass/ThreadManager/ThreadManager.cpp" # 线程管理类库 "../Bionic_Core/ToolsClass/ThreadManager/ThreadManager.cpp" # 线程管理类库
"../Bionic_Core/CppHandle/CppHandle.cpp" # C++&C兼容库 "../Bionic_Core/CppHandle/CppHandle.cpp" # C++&C兼容库
@@ -73,6 +74,7 @@ idf_component_register(SRCS "Bionic_sphere.c"
"../Bionic_Core/ToolsClass/AudioOutput" "../Bionic_Core/ToolsClass/AudioOutput"
"../Bionic_Core/ToolsClass/LVGL_Render" "../Bionic_Core/ToolsClass/LVGL_Render"
"../Bionic_Core/ToolsClass/SDFileManager" "../Bionic_Core/ToolsClass/SDFileManager"
"../Bionic_Core/ToolsClass/SpeechRecognizer"
"../Bionic_Core/ToolsClass/WifiConnectors" "../Bionic_Core/ToolsClass/WifiConnectors"
"../Bionic_Core/ToolsClass/ThreadManager" "../Bionic_Core/ToolsClass/ThreadManager"
"../Bionic_Core/CppHandle" "../Bionic_Core/CppHandle"
+20 -20
View File
@@ -560,26 +560,26 @@ CONFIG_SR_MN_EN_NONE=y
# #
# Add Chinese speech commands # Add Chinese speech commands
# #
CONFIG_CN_SPEECH_COMMAND_ID0="da kai kong tiao" CONFIG_CN_SPEECH_COMMAND_ID0=""
CONFIG_CN_SPEECH_COMMAND_ID1="guan bi kong tiao" CONFIG_CN_SPEECH_COMMAND_ID1=""
CONFIG_CN_SPEECH_COMMAND_ID2="zeng da feng su" CONFIG_CN_SPEECH_COMMAND_ID2=""
CONFIG_CN_SPEECH_COMMAND_ID3="jian xiao feng su" CONFIG_CN_SPEECH_COMMAND_ID3=""
CONFIG_CN_SPEECH_COMMAND_ID4="sheng gao yi du" CONFIG_CN_SPEECH_COMMAND_ID4=""
CONFIG_CN_SPEECH_COMMAND_ID5="jiang di yi du" CONFIG_CN_SPEECH_COMMAND_ID5=""
CONFIG_CN_SPEECH_COMMAND_ID6="zhi re mo shi" CONFIG_CN_SPEECH_COMMAND_ID6=""
CONFIG_CN_SPEECH_COMMAND_ID7="zhi leng mo shi" CONFIG_CN_SPEECH_COMMAND_ID7=""
CONFIG_CN_SPEECH_COMMAND_ID8="song feng mo shi" CONFIG_CN_SPEECH_COMMAND_ID8=""
CONFIG_CN_SPEECH_COMMAND_ID9="jie neng mo shi" CONFIG_CN_SPEECH_COMMAND_ID9=""
CONFIG_CN_SPEECH_COMMAND_ID10="chu shi mo shi" CONFIG_CN_SPEECH_COMMAND_ID10=""
CONFIG_CN_SPEECH_COMMAND_ID11="jian kang mo shi" CONFIG_CN_SPEECH_COMMAND_ID11=""
CONFIG_CN_SPEECH_COMMAND_ID12="shui mian mo shi" CONFIG_CN_SPEECH_COMMAND_ID12=""
CONFIG_CN_SPEECH_COMMAND_ID13="da kai lan ya" CONFIG_CN_SPEECH_COMMAND_ID13=""
CONFIG_CN_SPEECH_COMMAND_ID14="guan bi lan ya" CONFIG_CN_SPEECH_COMMAND_ID14=""
CONFIG_CN_SPEECH_COMMAND_ID15="kai shi bo fang" CONFIG_CN_SPEECH_COMMAND_ID15=""
CONFIG_CN_SPEECH_COMMAND_ID16="zan ting bo fang" CONFIG_CN_SPEECH_COMMAND_ID16=""
CONFIG_CN_SPEECH_COMMAND_ID17="ding shi yi xiao shi" CONFIG_CN_SPEECH_COMMAND_ID17=""
CONFIG_CN_SPEECH_COMMAND_ID18="da kai dian deng" CONFIG_CN_SPEECH_COMMAND_ID18=""
CONFIG_CN_SPEECH_COMMAND_ID19="guan bi dian deng" CONFIG_CN_SPEECH_COMMAND_ID19=""
CONFIG_CN_SPEECH_COMMAND_ID20="" CONFIG_CN_SPEECH_COMMAND_ID20=""
CONFIG_CN_SPEECH_COMMAND_ID21="" CONFIG_CN_SPEECH_COMMAND_ID21=""
CONFIG_CN_SPEECH_COMMAND_ID22="" CONFIG_CN_SPEECH_COMMAND_ID22=""
+2 -2
View File
@@ -1679,7 +1679,7 @@ CONFIG_ESP_SYSTEM_MEMPROT_FEATURE_LOCK=y
CONFIG_ESP_SYSTEM_EVENT_QUEUE_SIZE=32 CONFIG_ESP_SYSTEM_EVENT_QUEUE_SIZE=32
CONFIG_ESP_SYSTEM_EVENT_TASK_STACK_SIZE=2304 CONFIG_ESP_SYSTEM_EVENT_TASK_STACK_SIZE=2304
CONFIG_ESP_MAIN_TASK_STACK_SIZE=3584 CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192
CONFIG_ESP_MAIN_TASK_AFFINITY_CPU0=y CONFIG_ESP_MAIN_TASK_AFFINITY_CPU0=y
# CONFIG_ESP_MAIN_TASK_AFFINITY_CPU1 is not set # CONFIG_ESP_MAIN_TASK_AFFINITY_CPU1 is not set
# CONFIG_ESP_MAIN_TASK_AFFINITY_NO_AFFINITY is not set # CONFIG_ESP_MAIN_TASK_AFFINITY_NO_AFFINITY is not set
@@ -3221,7 +3221,7 @@ CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y
CONFIG_ESP32S3_DEFAULT_CPU_FREQ_MHZ=240 CONFIG_ESP32S3_DEFAULT_CPU_FREQ_MHZ=240
CONFIG_SYSTEM_EVENT_QUEUE_SIZE=32 CONFIG_SYSTEM_EVENT_QUEUE_SIZE=32
CONFIG_SYSTEM_EVENT_TASK_STACK_SIZE=2304 CONFIG_SYSTEM_EVENT_TASK_STACK_SIZE=2304
CONFIG_MAIN_TASK_STACK_SIZE=3584 CONFIG_MAIN_TASK_STACK_SIZE=8192
CONFIG_CONSOLE_UART_DEFAULT=y CONFIG_CONSOLE_UART_DEFAULT=y
# CONFIG_CONSOLE_UART_CUSTOM is not set # CONFIG_CONSOLE_UART_CUSTOM is not set
# CONFIG_CONSOLE_UART_NONE is not set # CONFIG_CONSOLE_UART_NONE is not set
+8
View File
@@ -224,3 +224,11 @@
- [x] 1. 历时两天,完整且完美的设计了宠物类,使用到了多种设计模式,完成了低耦合,高内聚的完美代码,测试也完美通过。 - [x] 1. 历时两天,完整且完美的设计了宠物类,使用到了多种设计模式,完成了低耦合,高内聚的完美代码,测试也完美通过。
- [x] 2. 顺便完善了底层通信类的封装,基于websocket,基准测试通过,但存在一点很小的线程bug,似乎是来自于esp32 idf底层的问题,待解决 - [x] 2. 顺便完善了底层通信类的封装,基于websocket,基准测试通过,但存在一点很小的线程bug,似乎是来自于esp32 idf底层的问题,待解决
#### Day15 2025.9.16
##### 主要目标:完成具体业务开发&各种优化
实际完成任务:
- [x] 1. 完成了语音识别的C++业务层封装,测试通过
- [x] 2. 对 LVGL GIF 渲染 + 音乐播放 + 语音识别的组合做了初步测试和简单优化:
LVGL 渲染略显卡顿,语音识别会出现缓冲区为空的警告;目前不影响使用,但仍需进一步深度优化。