// Bionic_sphere/Bionic_Core/ToolsClass/SpeechRecognizer/VadSlidingWindow.h

//
// Created by misaki on 2025/9/29.
//
/**
 * 本模块用于处理音频数据，使用滑动窗口和VAD算法进行人声语音段过滤
 * 依赖于VAD人声检测数据，并通过滑动窗口管理区间数据，以精确过滤出人声音频数据
 * @author  Misaki
 * @date    2025/9/29
 */
#pragma once
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <utility>
#include <vector>

// A single PCM audio frame together with the VAD verdict attached to it.
struct PCMFrame {
    std::vector<int16_t> audio_data;  // Owned copy of the PCM samples
    size_t data_length;               // Number of samples stored in audio_data
    bool vad_status;                  // VAD result supplied for this frame
    uint64_t timestamp;               // Caller-supplied timestamp (optional, for debugging)

    // Creates an empty frame: no samples, VAD flag false, timestamp 0.
    PCMFrame() : data_length(0), vad_status(false), timestamp(0) {}

    /**
     * @brief Builds a frame by copying `len` samples starting at `data`.
     *
     * A null `data` pointer produces an empty frame (`data_length == 0`)
     * instead of reading through the null pointer, so `data_length` always
     * equals `audio_data.size()`.
     *
     * @param data Pointer to the first sample; may be null.
     * @param len  Number of samples to copy from `data`.
     * @param vad  VAD flag to store with the frame.
     * @param ts   Timestamp to store with the frame (defaults to 0).
     */
    PCMFrame(const int16_t* data, const size_t len, const bool vad, const uint64_t ts = 0)
        : data_length(data != nullptr ? len : 0), vad_status(vad), timestamp(ts) {
        if (data != nullptr) {
            audio_data.assign(data, data + len);
        }
    }
};

// A group of consecutive frames plus the voice statistics gathered for them.
struct SlidingWindow {
    std::vector<PCMFrame> frames;     // Every frame held by this window
    size_t total_frames;              // Count of frames recorded for the window
    size_t voice_frames;              // Count of frames recorded as voice
    double voice_ratio;               // voice_frames / total_frames (0.0 when empty)
    bool is_active;                   // Activity flag; starts out false

    // Starts with no frames, zeroed counters and an inactive flag.
    SlidingWindow()
        : frames(), total_frames{0}, voice_frames{0}, voice_ratio{0.0}, is_active{false} {}

    // Refreshes voice_ratio from the two counters; an empty window yields 0.0.
    void calculateVoiceRatio() {
        if (total_frames == 0) {
            voice_ratio = 0.0;
            return;
        }
        voice_ratio = static_cast<double>(voice_frames) / total_frames;
    }
};

class AudioBufferManager {
private:
    // 单例实例
    static AudioBufferManager* instance;
    static std::mutex instance_mutex;

    // 配置参数
    struct Config {
        size_t max_window_frames;         // 单个窗口最大帧数
        size_t max_queue_size;            // 队列最大长度
        size_t silence_threshold_frames;  // 静音阈值帧数
        double voice_ratio_threshold;     // 人声占比阈值
        size_t pre_voice_frames;          // 人声开始前预保留帧数
        size_t post_voice_frames;         // 人声结束后保留帧数
    } config;

    // 内部状态
    std::unique_ptr<SlidingWindow> current_window;
    std::queue<std::unique_ptr<SlidingWindow>> completed_windows;
    std::vector<PCMFrame> pre_voice_buffer;  // 人声开始前的预缓存

    // 状态跟踪
    std::atomic<bool> in_voice_segment;
    size_t consecutive_silence_frames;
    size_t current_frame_count;

    // 线程同步
    std::mutex data_mutex;
    std::condition_variable data_condition;

    // 内存使用跟踪
    size_t estimated_memory_usage;
    const size_t MAX_MEMORY_BYTES = 512 * 1024;  // 512KB

private:
    AudioBufferManager() {
        initializeDefaultConfig();
        resetState();
    }

    void initializeDefaultConfig() {
        // 默认配置：基于16kHz采样率，每帧20ms（320样本）
        config.max_window_frames = 500;      // 10秒音频（500 * 20ms）
        config.max_queue_size = 8;           // 队列最多8个窗口
        config.silence_threshold_frames = 15; // 300ms静音判定结束（15 * 20ms）
        config.voice_ratio_threshold = 0.3;  // 30%人声占比阈值
        config.pre_voice_frames = 5;         // 人声开始前保留100ms
        config.post_voice_frames = 10;       // 人声结束后保留200ms

        estimated_memory_usage = 0;
    }

    void resetState() {
        current_window = std::make_unique<SlidingWindow>();
        in_voice_segment = false;
        consecutive_silence_frames = 0;
        current_frame_count = 0;
        pre_voice_buffer.clear();
        pre_voice_buffer.reserve(config.pre_voice_frames);
    }

    // 估算单个帧的内存使用
    size_t estimateFrameMemory(const PCMFrame& frame) const {
        return sizeof(PCMFrame) + (frame.audio_data.capacity() * sizeof(int16_t));
    }

    // 估算窗口内存使用
    size_t estimateWindowMemory(const SlidingWindow& window) const {
        size_t memory = sizeof(SlidingWindow);
        for (const auto& frame : window.frames) {
            memory += estimateFrameMemory(frame);
        }
        return memory;
    }

    // 检查内存限制
    bool checkMemoryConstraints() const {
        return estimated_memory_usage < MAX_MEMORY_BYTES;
    }

public:
    // 删除拷贝构造函数和赋值运算符
    AudioBufferManager(const AudioBufferManager&) = delete;
    AudioBufferManager& operator=(const AudioBufferManager&) = delete;

    // 获取单例实例
    static AudioBufferManager* getInstance() {
        std::lock_guard<std::mutex> lock(instance_mutex);
        if (!instance) {
            instance = new AudioBufferManager();
        }
        return instance;
    }

    /**
     * @brief 配置管理器参数
     * @param max_window_frames     单个窗口最大帧数
     * @param max_queue_size        队列最大长度
     * @param silence_threshold     静音阈值帧数
     * @param voice_ratio_threshold 人声占比阈值
     * @param pre_voice_frames      人声开始前预保留帧数
     * @param post_voice_frames     人声结束后保留帧数
     */
    void configure(const size_t max_window_frames = 500,
                   const size_t max_queue_size = 8,
                   const size_t silence_threshold = 15,
                   const double voice_ratio_threshold = 0.2,
                   const size_t pre_voice_frames = 5,
                   const size_t post_voice_frames = 10) {
        std::lock_guard<std::mutex> lock(data_mutex);

        config.max_window_frames = max_window_frames;
        config.max_queue_size = max_queue_size;
        config.silence_threshold_frames = silence_threshold;
        config.voice_ratio_threshold = voice_ratio_threshold;
        config.pre_voice_frames = pre_voice_frames;
        config.post_voice_frames = post_voice_frames;

        // 重新初始化状态
        resetState();
    }

    /**
     * @brief 注入新的音频帧数据
     * @param audio_data PCM音频数据指针
     * @param data_length 数据长度（样本数）
     * @param vad_status 当前帧的VAD状态
     */
    void injectAudioFrame(const int16_t* audio_data, size_t data_length, bool vad_status) {
        std::lock_guard<std::mutex> lock(data_mutex);

        // 创建新帧
        PCMFrame new_frame(audio_data, data_length, vad_status, current_frame_count++);
        size_t frame_memory = estimateFrameMemory(new_frame);

        // 检查内存限制
        if (!checkMemoryConstraints()) {
            // 内存不足，采取清理策略
            if (!completed_windows.empty()) {
                auto old_window = std::move(completed_windows.front());
                completed_windows.pop();
                estimated_memory_usage -= estimateWindowMemory(*old_window);
            }
        }

        // 更新预缓存
        updatePreVoiceBuffer(new_frame);

        // 状态机处理
        if (!in_voice_segment) {
            handleNonVoiceState(new_frame, frame_memory);
        } else {
            handleVoiceState(new_frame, frame_memory);
        }

        estimated_memory_usage += frame_memory;
    }

    /**
     * @brief 获取可用的音频窗口数据
     * @param timeout_ms 超时时间（毫秒）
     * @return 滑动窗口指针，如果没有可用数据则返回nullptr
     */
    std::unique_ptr<SlidingWindow> getAudioWindow(int timeout_ms = 0) {
        std::unique_lock<std::mutex> lock(data_mutex);
        if (completed_windows.empty()) {
            if (timeout_ms <= 0) {
                return nullptr;
            }
            // 等待数据可用
            if (data_condition.wait_for(lock,
                std::chrono::milliseconds(timeout_ms)) == std::cv_status::timeout) {
                return nullptr;
            }
            if (completed_windows.empty()) {
                return nullptr;
            }
        }
        auto window = std::move(completed_windows.front());
        completed_windows.pop();
        estimated_memory_usage -= estimateWindowMemory(*window);
        return window;
    }

    /**
     * @brief 检查是否有可用的音频数据
     */
    bool hasAvailableData() {
        std::lock_guard<std::mutex> lock(data_mutex);
        return !completed_windows.empty();
    }

    /**
     * @brief 获取当前队列大小
     */
    size_t getQueueSize() {
        std::lock_guard<std::mutex> lock(data_mutex);
        return completed_windows.size();
    }

    /**
     * @brief 获取估计的内存使用量
     */
    size_t getEstimatedMemoryUsage() const {
        return estimated_memory_usage;
    }

    /**
     * @brief 强制结束当前语音段（如果有）
     */
    void forceEndCurrentSegment() {
        std::lock_guard<std::mutex> lock(data_mutex);
        if (in_voice_segment && current_window->total_frames > 0) {
            finalizeCurrentWindow();
        }
    }

    /**
     * @brief 清空所有缓存数据
     */
    void clearAllData() {
        std::lock_guard<std::mutex> lock(data_mutex);
        resetState();
        std::queue<std::unique_ptr<SlidingWindow>> empty_queue;
        std::swap(completed_windows, empty_queue);
        estimated_memory_usage = 0;
    }

private:
    // 更新人声开始前的预缓存
    void updatePreVoiceBuffer(const PCMFrame& frame) {
        pre_voice_buffer.push_back(frame);

        // 保持预缓存大小不超过配置值
        if (pre_voice_buffer.size() > config.pre_voice_frames) {
            pre_voice_buffer.erase(pre_voice_buffer.begin());
        }
    }

    // 处理非人声状态
    void handleNonVoiceState(const PCMFrame& frame, size_t frame_memory) {
        if (frame.vad_status) {
            // 检测到人声开始
            in_voice_segment = true;
            consecutive_silence_frames = 0;

            // 将预缓存数据加入当前窗口
            for (const auto& pre_frame : pre_voice_buffer) {
                current_window->frames.push_back(pre_frame);
                current_window->total_frames++;
                if (pre_frame.vad_status) {
                    current_window->voice_frames++;
                }
            }

            // 添加当前帧
            addFrameToCurrentWindow(frame);
        }
        // 非人声状态下，不进行其他处理
    }

    // 处理人声状态
    void handleVoiceState(const PCMFrame& frame, size_t frame_memory) {
        if (frame.vad_status) {
            // 仍然是人声，重置静音计数
            consecutive_silence_frames = 0;
        } else {
            // 静音帧
            consecutive_silence_frames++;
        }

        // 添加当前帧到窗口
        addFrameToCurrentWindow(frame);

        // 检查是否需要结束当前语音段
        if (consecutive_silence_frames >= config.silence_threshold_frames ||
            current_window->frames.size() >= config.max_window_frames) {

            // 添加人声结束后的保留帧
            addPostVoiceFrames();

            // 完成当前窗口
            finalizeCurrentWindow();
        }
    }

    // 添加帧到当前窗口
    void addFrameToCurrentWindow(const PCMFrame& frame) {
        current_window->frames.push_back(frame);
        current_window->total_frames++;
        if (frame.vad_status) {
            current_window->voice_frames++;
        }
        current_window->calculateVoiceRatio();
    }

    // 添加人声结束后的保留帧
    void addPostVoiceFrames() {
        // 这个函数在实际实现中需要缓存后续的帧
        // 简化实现：在当前设计中，我们依赖静音阈值来自然包含结束后的帧
    }

    // 完成当前窗口的处理
    void finalizeCurrentWindow() {
        // 计算最终的人声占比
        current_window->calculateVoiceRatio();

        // 检查人声占比是否达到阈值
        if (current_window->voice_ratio >= config.voice_ratio_threshold) {
            // 窗口有效，加入队列
            if (completed_windows.size() >= config.max_queue_size) {
                // 队列已满，移除最旧的数据
                auto old_window = std::move(completed_windows.front());
                completed_windows.pop();
                estimated_memory_usage -= estimateWindowMemory(*old_window);
            }

            completed_windows.push(std::move(current_window));
            data_condition.notify_one();  // 通知等待的消费者
        }

        // 重置状态，开始新的窗口
        resetState();
    }
};