// Bionic_sphere/Bionic_Core/ToolsClass/SpeechRecognizer/SpeechRecognizer.cpp

//
// Created by misaki on 2025/9/15.
//
#include "SpeechRecognizer.h"
#include "esp_afe_sr_models.h"
#include "esp_mn_models.h"
#include "esp_wn_iface.h"
#include "esp_mn_speech_commands.h"
#include "model_path.h"
#include "driver/gpio.h"
#include "soc/soc_caps.h"
#include "esp_err.h"
#include "nvs_flash.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include <atomic>
#include <cstring>
#include <memory>
#include <utility>

// Definitions of the static singleton state: the instance pointer (null until
// getInstance() creates the object) and the mutex getInstance() locks while
// checking/creating it.
SpeechRecognizer* SpeechRecognizer::instance = nullptr;
std::mutex SpeechRecognizer::instanceMutex;

/**
 * @brief Return the process-wide SpeechRecognizer, creating it on first use.
 *
 * Creation is serialized through instanceMutex, so concurrent first calls
 * construct exactly one object.
 *
 * @return Pointer to the singleton instance.
 */
SpeechRecognizer* SpeechRecognizer::getInstance() {
    std::lock_guard<std::mutex> guard(instanceMutex);
    if (instance == nullptr) {
        instance = new SpeechRecognizer();
    }
    return instance;
}

SpeechRecognizer::SpeechRecognizer()
    : initialized(false),
      running(false),
      enabled(true),
      rx_handle(nullptr),
      afe_handle(nullptr),
      afe_data(nullptr),
      models(nullptr),
      multinet(nullptr),
      model_data(nullptr),
      tasksRunning(false) {
}

/**
 * @brief Destroy the recognizer, delegating all cleanup to deinit().
 */
SpeechRecognizer::~SpeechRecognizer() {
    this->deinit();
}

bool SpeechRecognizer::init(const SpeechRecognizerConfig& config) {
    if (initialized) {
        ESP_LOGI("SpeechRecognizer", "Already initialized");
        return true;
    }
    this->config = config;
    // 初始化I2S
    if (!initI2S()) {
        ESP_LOGE("SpeechRecognizer", "I2S initialization failed");
        return false;
    }
    // 初始化ESP-SR
    if (!initESP_SR()) {
        ESP_LOGE("SpeechRecognizer", "ESP-SR initialization failed");
        return false;
    }
    initialized = true;
    ESP_LOGI("SpeechRecognizer", "Initialization completed successfully");
    return true;
}

void SpeechRecognizer::deinit() {
    if (!initialized) {
        return;
    }
    stop();
    // 释放ESP-SR资源
    if (model_data && multinet) {
        multinet->destroy(model_data);
        model_data = nullptr;
    }
    if (afe_data && afe_handle) {
        afe_handle->destroy(afe_data);
        afe_data = nullptr;
    }
    if (models) {
        // 注意：esp_srmodel_init分配的资源可能需要特殊清理
        // 根据ESP-SR文档进行适当清理
    }
    // 释放I2S资源
    if (rx_handle) {
        i2s_channel_disable(rx_handle);
        i2s_del_channel(rx_handle);
        rx_handle = nullptr;
    }
    initialized = false;
    ESP_LOGI("SpeechRecognizer", "Deinitialized");
}

bool SpeechRecognizer::initI2S() {
    esp_err_t ret = ESP_OK;
    i2s_chan_config_t chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_1, I2S_ROLE_MASTER);
    ret = i2s_new_channel(&chan_cfg, nullptr, &rx_handle);
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to create I2S channel: %s", esp_err_to_name(ret));
        return false;
    }
    i2s_std_config_t std_cfg = {
        .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(16000),
        .slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .mclk = GPIO_NUM_NC,
            .bclk = config.bclk_pin,
            .ws = config.ws_pin,
            .dout = GPIO_NUM_NC,
            .din = config.din_pin,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv = false,
            },
        },
    };
    std_cfg.slot_cfg.slot_mask = I2S_STD_SLOT_RIGHT;
    ret = i2s_channel_init_std_mode(rx_handle, &std_cfg);
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to init I2S standard mode: %s", esp_err_to_name(ret));
        return false;
    }
    ESP_LOGI("SpeechRecognizer", "I2S initialized successfully");
    return true;
}

bool SpeechRecognizer::initESP_SR() {
    // 获取AFE句柄
    afe_handle = &ESP_AFE_SR_HANDLE;
    if (!afe_handle) {
        ESP_LOGE("SpeechRecognizer", "Failed to get AFE handle");
        return false;
    }
    // 初始化模型
    models = esp_srmodel_init(config.model_path.c_str());
    if (!models) {
        ESP_LOGE("SpeechRecognizer", "Failed to initialize models from path: %s", config.model_path.c_str());
        return false;
    }
    // 配置AFE
    afe_config_t afe_config = {
        .aec_init = config.enable_aec,
        .se_init = config.enable_se,
        .vad_init = config.enable_vad,
        .wakenet_init = false,  // 禁用唤醒词
        .voice_communication_init = false,
        .voice_communication_agc_init = false,
        .voice_communication_agc_gain = 15,
        .vad_mode = config.vad_mode,
        .wakenet_model_name = nullptr,
        .wakenet_model_name_2 = nullptr,
        .wakenet_mode = DET_MODE_2CH_90,
        .afe_mode = SR_MODE_LOW_COST,
        .afe_perferred_core = 0,
        .afe_perferred_priority = 5,
        .afe_ringbuf_size = 10,
        .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM,
        .afe_linear_gain = 1.0,
        .agc_mode = AFE_MN_PEAK_AGC_MODE_2,
        .pcm_config = {
            .total_ch_num = 2,
            .mic_num = 1,
            .ref_num = 1,
            .sample_rate = 16000,
        },
        .debug_init = false,
        .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, nullptr}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, nullptr}},
    };
    afe_data = afe_handle->create_from_config(&afe_config);
    if (!afe_data) {
        ESP_LOGE("SpeechRecognizer", "Failed to create AFE data from config");
        return false;
    }
    // 加载MultiNet模型(采用esp-sr提供的宏来处理不同语种的模型的处理问题)
#if defined(CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8) || defined(CONFIG_SR_MN_CN_MULTINET6_QUANT) || defined(CONFIG_SR_MN_CN_MULTINET6_AC_QUANT)
    char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE);
#else
    char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH);
#endif
    if (!mn_name) {
        ESP_LOGE("SpeechRecognizer", "No MultiNet model found");
        return false;
    }
    multinet = esp_mn_handle_from_name(mn_name);
    if (!multinet) {
        ESP_LOGE("SpeechRecognizer", "Failed to get MultiNet handle");
        return false;
    }
    model_data = multinet->create(mn_name, config.detection_timeout);
    if (!model_data) {
        ESP_LOGE("SpeechRecognizer", "Failed to create MultiNet model data");
        return false;
    }
    ESP_LOGI("SpeechRecognizer", "ESP-SR initialized successfully with model: %s", mn_name);
    return true;
}

bool SpeechRecognizer::start() {
    if (!initialized) {
        ESP_LOGE("SpeechRecognizer", "Not initialized");
        return false;
    }
    if (running) {
        ESP_LOGI("SpeechRecognizer", "Already running");
        return true;
    }
    // 启用I2S通道
    esp_err_t ret = i2s_channel_enable(rx_handle);
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to enable I2S channel: %s", esp_err_to_name(ret));
        return false;
    }
    // 启动任务
    tasksRunning = true;
    // 使用ThreadManager创建任务
    feedThread = ThreadManager::createMemberThread(
        config.feed_thread_config, this, &SpeechRecognizer::feedTask);

    detectThread = ThreadManager::createMemberThread(
        config.detect_thread_config, this, &SpeechRecognizer::detectTask);
    running = true;
    updateState("started");
    ESP_LOGI("SpeechRecognizer", "Speech recognition started");
    return true;
}

/**
 * @brief Stop recognition: end both worker threads and disable audio capture.
 *
 * Does nothing when the recognizer is not running. Otherwise the worker loops
 * are asked to exit, both threads are joined, the I2S channel is disabled and
 * the state is reported as "stopped".
 */
void SpeechRecognizer::stop() {
    if (running) {
        // Signal both worker loops to leave, then wait for them.
        tasksRunning = false;
        if (feedThread.joinable()) {
            feedThread.join();
        }
        if (detectThread.joinable()) {
            detectThread.join();
        }

        // With no reader left, audio capture can be switched off.
        if (rx_handle != nullptr) {
            i2s_channel_disable(rx_handle);
        }

        running = false;
        updateState("stopped");
        ESP_LOGI("SpeechRecognizer", "Speech recognition stopped");
    }
}

void SpeechRecognizer::feedTask() {
    ThreadManager::printThreadInfo("Feed task started");
    int audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
    int nch = afe_handle->get_channel_num(afe_data);
    size_t samp_len = audio_chunksize;
    size_t samp_len_bytes = samp_len * sizeof(int32_t);  // 单声道32位
    auto *i2s_buff = static_cast<int32_t *>(malloc(samp_len_bytes));
    if (!i2s_buff) {
        ESP_LOGE("SpeechRecognizer", "Failed to allocate memory for I2S buffer");
        return;
    }
    size_t bytes_read;
    while (tasksRunning) {
        esp_err_t ret = i2s_channel_read(rx_handle, i2s_buff, samp_len_bytes, &bytes_read, portMAX_DELAY);
        if (ret != ESP_OK) {
            ESP_LOGE("SpeechRecognizer", "I2S read error: %s", esp_err_to_name(ret));
            vTaskDelay(pdMS_TO_TICKS(10));
            continue;
        }
        // 处理音频数据（32位转16位）
        for (int i = 0; i < samp_len; ++i) {
            i2s_buff[i] = i2s_buff[i] >> 14; // 32:8是有效位，转换为16位音频数据
        }
        // 喂数据给AFE
        afe_handle->feed(afe_data, reinterpret_cast<int16_t *>(i2s_buff));
    }
    free(i2s_buff);
    ESP_LOGI("SpeechRecognizer", "Feed task exited");
}

void SpeechRecognizer::detectTask() {
    ThreadManager::printThreadInfo("Detect task started");
    int afe_chunksize = afe_handle->get_fetch_chunksize(afe_data);
    int mu_chunksize = multinet->get_samp_chunksize(model_data);
    if (mu_chunksize != afe_chunksize) {
        ESP_LOGE("SpeechRecognizer", "Chunk size mismatch: AFE=%d, MultiNet=%d", afe_chunksize, mu_chunksize);
        return;
    }
    updateState("ready");
    ESP_LOGI("SpeechRecognizer", "Ready for speech recognition");
    while (tasksRunning) {
        afe_fetch_result_t* res = afe_handle->fetch(afe_data);
        if (!res || res->ret_value == ESP_FAIL) {
            ESP_LOGE("SpeechRecognizer", "AFE fetch error");
            vTaskDelay(pdMS_TO_TICKS(10));
            continue;
        }
        if (!enabled) {
            vTaskDelay(pdMS_TO_TICKS(100));
            continue;
        }
        // 使用MultiNet进行语音检测
        esp_mn_state_t mn_state = multinet->detect(model_data, res->data);
        if (mn_state == ESP_MN_STATE_DETECTING) {
            // 检测中，不做处理
            continue;
        } else if (mn_state == ESP_MN_STATE_DETECTED) {
            // 检测到语音命令
            esp_mn_results_t *mn_result = multinet->get_results(model_data);
            if (mn_result && mn_result->num > 0) {
                SpeechRecognitionResult result;
                result.command_id = mn_result->command_id[0];
                result.phrase = mn_result->string;
                result.probability = mn_result->prob[0];
                result.phrase_id = mn_result->phrase_id[0];

                handleRecognitionResult(result);
            }
        } else if (mn_state == ESP_MN_STATE_TIMEOUT) {
            // 识别超时
            updateState("timeout");
            esp_mn_results_t *mn_result = multinet->get_results(model_data);
            ESP_LOGI("SpeechRecognizer", "Detection timeout: %s",
                    mn_result && mn_result->string ? mn_result->string : "");
            // 重置检测状态
            multinet->clean(model_data);
            updateState("ready");
        }
    }
    ESP_LOGI("SpeechRecognizer", "Detect task exited");
}

bool SpeechRecognizer::addCommand(int command_id, const std::string& phrase) {
    // 加强检查，确保所有相关指针都有效
    if (!multinet) {
        ESP_LOGE("SpeechRecognizer", "MultiNet handle is null");
        return false;
    }
    if (!model_data) {
        ESP_LOGE("SpeechRecognizer", "Model data is null");
        return false;
    }
    esp_err_t ret = esp_mn_commands_add(command_id, phrase.c_str());
    if (ret != ESP_OK) {
        ESP_LOGE("SpeechRecognizer", "Failed to add command: %s", esp_err_to_name(ret));
        return false;
    }
    // 更新命令列表
    esp_mn_error_t *ret_mn = esp_mn_commands_update();
    if (ret_mn) {
        if (ret_mn->num >= 1) {
            ESP_LOGE("SpeechRecognizer", "无法更新的指令数量: %d", ret_mn->num);
            for (int i = 0; i < ret_mn->num; i++) {
                ESP_LOGE("SpeechRecognizer", "无法更新的指令名称: %s，对应的id=%d", ret_mn->phrases[i]->string, ret_mn->phrases[i]->command_id);
            }
            return false;
        }
        else {
            ESP_LOGE("SpeechRecognizer", "无法更新的指令短语数组为空");
        }
    }
    // 打印缓存的指令
    ESP_LOGI("SpeechRecognizer", "Added command: ID=%d, Phrase=%s", command_id, phrase.c_str());
    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
    esp_mn_commands_print();
    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
    esp_mn_active_commands_print();
    return true;
}

/**
 * @brief Register several speech commands via addCommand().
 *
 * Every entry is attempted even if an earlier one fails. The cached and
 * active command lists are printed once at the end.
 *
 * @param commands Pairs of (command id, phrase).
 * @return true only if every command was added successfully.
 */
bool SpeechRecognizer::addCommands(const std::vector<std::pair<int, std::string>>& commands) {
    bool allAdded = true;
    for (const auto& entry : commands) {
        // addCommand() is evaluated first so a previous failure never skips it.
        allAdded = addCommand(entry.first, entry.second) && allAdded;
    }
    ESP_LOGI("SpeechRecognizer", "下面是当前已经缓存的指令:");
    esp_mn_commands_print();
    ESP_LOGI("SpeechRecognizer", "下面是当前已经应用的指令:");
    esp_mn_active_commands_print();
    return allAdded;
}

/**
 * @brief Clear the speech command list.
 *
 * Does nothing unless both the MultiNet handle and its model data exist.
 */
void SpeechRecognizer::clearCommands() {
    if (multinet == nullptr || model_data == nullptr) {
        return;
    }
    esp_mn_commands_clear();
    ESP_LOGI("SpeechRecognizer", "All commands cleared");
}

/**
 * @brief Install the callback invoked for every recognized command.
 *
 * @param callback Callable taken by value and moved into the recognizer,
 *                 replacing any previously registered callback.
 */
void SpeechRecognizer::registerCommandCallback(SpeechCommandCallback callback) {
    this->commandCallback = std::move(callback);
}

/**
 * @brief Install the callback invoked on every state change.
 *
 * @param callback Callable taken by value and moved into the recognizer,
 *                 replacing any previously registered callback.
 */
void SpeechRecognizer::registerStateCallback(SpeechStateCallback callback) {
    this->stateCallback = std::move(callback);
}

/**
 * @brief Process one recognized command.
 *
 * Stores the result as the latest one and appends it to the history (trimmed
 * to 100 entries), logs it, reports the "command_detected" state, invokes the
 * command callback if one is registered, resets the MultiNet detector and
 * finally reports the "ready" state.
 *
 * @param result The recognition result to record and dispatch.
 */
void SpeechRecognizer::handleRecognitionResult(const SpeechRecognitionResult& result) {
    // Record the result; the lock covers only the bookkeeping.
    {
        std::lock_guard<std::mutex> guard(historyMutex);
        lastResult = result;
        history.push_back(result);

        // Drop the oldest entry once the cap is exceeded.
        const bool overCapacity = history.size() > 100;
        if (overCapacity) {
            history.erase(history.begin());
        }
    }

    ESP_LOGI("SpeechRecognizer", "Command detected: ID=%d, Phrase='%s', Probability=%.2f",
            result.command_id, result.phrase.c_str(), result.probability);
    updateState("command_detected");

    // Notify the registered listener, if any.
    if (commandCallback) {
        commandCallback(result.command_id, result.phrase, result.probability);
    }

    // Reset the detector so the next utterance starts clean.
    if (multinet != nullptr && model_data != nullptr) {
        multinet->clean(model_data);
    }
    updateState("ready");
}

/**
 * @brief Record a new state, log it and notify the state callback.
 *
 * The mutex is held only while the stored state is replaced; logging and the
 * callback run after it has been released.
 *
 * @param state Name of the new state.
 */
void SpeechRecognizer::updateState(const std::string& state) {
    std::unique_lock<std::mutex> guard(stateMutex);
    currentState = state;
    guard.unlock();

    ESP_LOGI("SpeechRecognizer", "State changed: %s", state.c_str());
    if (stateCallback) {
        stateCallback(state);
    }
}

/**
 * @brief Return a copy of the most recently stored recognition result.
 *
 * The copy is taken while holding historyMutex.
 */
SpeechRecognitionResult SpeechRecognizer::getLastResult() const {
    std::lock_guard<std::mutex> guard(historyMutex);
    SpeechRecognitionResult snapshot = lastResult;
    return snapshot;
}

/**
 * @brief Return a copy of the stored recognition history.
 *
 * The copy is taken while holding historyMutex.
 */
std::vector<SpeechRecognitionResult> SpeechRecognizer::getHistory() const {
    std::lock_guard<std::mutex> guard(historyMutex);
    std::vector<SpeechRecognitionResult> snapshot = history;
    return snapshot;
}

/**
 * @brief Switch command detection on or off.
 *
 * Stores the flag, reports "enabled" or "disabled" through updateState() and
 * logs the change.
 *
 * @param enabled true to enable detection, false to disable it.
 */
void SpeechRecognizer::setEnabled(bool enabled) {
    this->enabled = enabled;
    const char* const label = enabled ? "enabled" : "disabled";
    updateState(label);
    ESP_LOGI("SpeechRecognizer", "Speech recognition %s", label);
}

/**
 * @brief Return a copy of the current state name.
 *
 * The copy is taken while holding stateMutex.
 */
std::string SpeechRecognizer::getCurrentState() const {
    std::lock_guard<std::mutex> guard(stateMutex);
    std::string snapshot = currentState;
    return snapshot;
}

/**
 * @brief Report whether recognition is currently running.
 *
 * @return The value of the running flag.
 */
bool SpeechRecognizer::isRunning() const {
    const bool active = running;
    return active;
}