Files
Bionic_sphere/Bionic_Core/ToolsClass/SpeechRecognizer/VadSlidingWindow.h
T
Misaki ba5e47bc77 这是一次长久的提交:
1. 应用界面增加了返回主页的按钮
2. 修复了gif渲染内存泄漏的严重bug
3. 将PetDao当中的cJSON API替换为cpp_json,完美通过测试
4. 整合已经实现的各种上层建筑,实现了一个宠物对话基本业务应用,用于样品测试展示用
5. 重构了音频播放类,使其更modern,更加便于移植和拓展
2025-10-16 11:36:45 +08:00

378 lines
12 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//
// Created by misaki on 2025/9/29.
//
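/**
* This module processes audio data, combining a sliding window with a VAD (voice activity detection) algorithm to filter out human-voice segments.
* It relies on the per-frame VAD results and manages each interval's frames in a sliding window so that voiced audio can be extracted precisely.
* @author Misaki
* @date 2025/9/29
*/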
#pragma once
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <utility>
#include <vector>
/**
 * @brief One frame of PCM audio together with its VAD verdict.
 *
 * The samples are copied into the frame, so the caller's buffer may be reused
 * as soon as the constructor returns.
 */
struct PCMFrame {
    std::vector<int16_t> audio_data;  // Copied PCM samples
    size_t data_length = 0;           // Number of samples held in audio_data
    bool vad_status = false;          // VAD result for this frame (true = voice)
    uint64_t timestamp = 0;           // Optional timestamp / sequence number, for debugging

    /// Creates an empty, non-voice frame.
    PCMFrame() = default;

    /**
     * @brief Builds a frame by copying @p len samples starting at @p data.
     * @param data Pointer to the first sample; may be nullptr.
     * @param len  Number of samples to copy.
     * @param vad  VAD result for this frame.
     * @param ts   Optional timestamp / sequence number.
     *
     * A null @p data produces an empty frame with data_length == 0 regardless
     * of @p len, so data_length always matches audio_data.size() and a null
     * pointer is never dereferenced.
     */
    PCMFrame(const int16_t* data, const size_t len, const bool vad, const uint64_t ts = 0)
        : data_length(data != nullptr ? len : 0), vad_status(vad), timestamp(ts) {
        if (data != nullptr) {
            audio_data.assign(data, data + len);
        }
    }
};
/**
 * @brief A window of PCM frames plus running voice statistics.
 *
 * The counters are plain public fields maintained by whoever fills the window;
 * calculateVoiceRatio() derives voice_ratio from them on demand.
 */
struct SlidingWindow {
    std::vector<PCMFrame> frames;  // All frames collected in this window
    size_t total_frames = 0;       // Total number of frames counted
    size_t voice_frames = 0;       // Number of frames flagged as voice
    double voice_ratio = 0.0;      // voice_frames / total_frames
    bool is_active = false;        // Whether the window is currently active

    /// Creates an empty, inactive window with all counters at zero.
    SlidingWindow() {}

    /// Recomputes voice_ratio; an empty window yields 0.0 rather than dividing by zero.
    void calculateVoiceRatio() {
        voice_ratio = (total_frames == 0)
                          ? 0.0
                          : static_cast<double>(voice_frames) / total_frames;
    }
};
class AudioBufferManager {
private:
// 单例实例
static AudioBufferManager* instance;
static std::mutex instance_mutex;
// 配置参数
struct Config {
size_t max_window_frames; // 单个窗口最大帧数
size_t max_queue_size; // 队列最大长度
size_t silence_threshold_frames; // 静音阈值帧数
double voice_ratio_threshold; // 人声占比阈值
size_t pre_voice_frames; // 人声开始前预保留帧数
size_t post_voice_frames; // 人声结束后保留帧数
} config;
// 内部状态
std::unique_ptr<SlidingWindow> current_window;
std::queue<std::unique_ptr<SlidingWindow>> completed_windows;
std::vector<PCMFrame> pre_voice_buffer; // 人声开始前的预缓存
// 状态跟踪
std::atomic<bool> in_voice_segment;
size_t consecutive_silence_frames;
size_t current_frame_count;
// 线程同步
std::mutex data_mutex;
std::condition_variable data_condition;
// 内存使用跟踪
size_t estimated_memory_usage;
const size_t MAX_MEMORY_BYTES = 512 * 1024; // 512KB
private:
AudioBufferManager() {
initializeDefaultConfig();
resetState();
}
void initializeDefaultConfig() {
// 默认配置:基于16kHz采样率,每帧20ms(320样本)
config.max_window_frames = 500; // 10秒音频(500 * 20ms
config.max_queue_size = 8; // 队列最多8个窗口
config.silence_threshold_frames = 15; // 300ms静音判定结束(15 * 20ms
config.voice_ratio_threshold = 0.3; // 30%人声占比阈值
config.pre_voice_frames = 5; // 人声开始前保留100ms
config.post_voice_frames = 10; // 人声结束后保留200ms
estimated_memory_usage = 0;
}
void resetState() {
current_window = std::make_unique<SlidingWindow>();
in_voice_segment = false;
consecutive_silence_frames = 0;
current_frame_count = 0;
pre_voice_buffer.clear();
pre_voice_buffer.reserve(config.pre_voice_frames);
}
// 估算单个帧的内存使用
size_t estimateFrameMemory(const PCMFrame& frame) const {
return sizeof(PCMFrame) + (frame.audio_data.capacity() * sizeof(int16_t));
}
// 估算窗口内存使用
size_t estimateWindowMemory(const SlidingWindow& window) const {
size_t memory = sizeof(SlidingWindow);
for (const auto& frame : window.frames) {
memory += estimateFrameMemory(frame);
}
return memory;
}
// 检查内存限制
bool checkMemoryConstraints() const {
return estimated_memory_usage < MAX_MEMORY_BYTES;
}
public:
// 删除拷贝构造函数和赋值运算符
AudioBufferManager(const AudioBufferManager&) = delete;
AudioBufferManager& operator=(const AudioBufferManager&) = delete;
// 获取单例实例
static AudioBufferManager* getInstance() {
std::lock_guard<std::mutex> lock(instance_mutex);
if (!instance) {
instance = new AudioBufferManager();
}
return instance;
}
/**
* @brief 配置管理器参数
* @param max_window_frames 单个窗口最大帧数
* @param max_queue_size 队列最大长度
* @param silence_threshold 静音阈值帧数
* @param voice_ratio_threshold 人声占比阈值
* @param pre_voice_frames 人声开始前预保留帧数
* @param post_voice_frames 人声结束后保留帧数
*/
void configure(const size_t max_window_frames = 500,
const size_t max_queue_size = 8,
const size_t silence_threshold = 15,
const double voice_ratio_threshold = 0.2,
const size_t pre_voice_frames = 5,
const size_t post_voice_frames = 10) {
std::lock_guard<std::mutex> lock(data_mutex);
config.max_window_frames = max_window_frames;
config.max_queue_size = max_queue_size;
config.silence_threshold_frames = silence_threshold;
config.voice_ratio_threshold = voice_ratio_threshold;
config.pre_voice_frames = pre_voice_frames;
config.post_voice_frames = post_voice_frames;
// 重新初始化状态
resetState();
}
/**
* @brief 注入新的音频帧数据
* @param audio_data PCM音频数据指针
* @param data_length 数据长度(样本数)
* @param vad_status 当前帧的VAD状态
*/
void injectAudioFrame(const int16_t* audio_data, size_t data_length, bool vad_status) {
std::lock_guard<std::mutex> lock(data_mutex);
// 创建新帧
PCMFrame new_frame(audio_data, data_length, vad_status, current_frame_count++);
size_t frame_memory = estimateFrameMemory(new_frame);
// 检查内存限制
if (!checkMemoryConstraints()) {
// 内存不足,采取清理策略
if (!completed_windows.empty()) {
auto old_window = std::move(completed_windows.front());
completed_windows.pop();
estimated_memory_usage -= estimateWindowMemory(*old_window);
}
}
// 更新预缓存
updatePreVoiceBuffer(new_frame);
// 状态机处理
if (!in_voice_segment) {
handleNonVoiceState(new_frame, frame_memory);
} else {
handleVoiceState(new_frame, frame_memory);
}
estimated_memory_usage += frame_memory;
}
/**
* @brief 获取可用的音频窗口数据
* @param timeout_ms 超时时间(毫秒)
* @return 滑动窗口指针,如果没有可用数据则返回nullptr
*/
std::unique_ptr<SlidingWindow> getAudioWindow(int timeout_ms = 0) {
std::unique_lock<std::mutex> lock(data_mutex);
if (completed_windows.empty()) {
if (timeout_ms <= 0) {
return nullptr;
}
// 等待数据可用
if (data_condition.wait_for(lock,
std::chrono::milliseconds(timeout_ms)) == std::cv_status::timeout) {
return nullptr;
}
if (completed_windows.empty()) {
return nullptr;
}
}
auto window = std::move(completed_windows.front());
completed_windows.pop();
estimated_memory_usage -= estimateWindowMemory(*window);
return window;
}
/**
* @brief 检查是否有可用的音频数据
*/
bool hasAvailableData() {
std::lock_guard<std::mutex> lock(data_mutex);
return !completed_windows.empty();
}
/**
* @brief 获取当前队列大小
*/
size_t getQueueSize() {
std::lock_guard<std::mutex> lock(data_mutex);
return completed_windows.size();
}
/**
* @brief 获取估计的内存使用量
*/
size_t getEstimatedMemoryUsage() const {
return estimated_memory_usage;
}
/**
* @brief 强制结束当前语音段(如果有)
*/
void forceEndCurrentSegment() {
std::lock_guard<std::mutex> lock(data_mutex);
if (in_voice_segment && current_window->total_frames > 0) {
finalizeCurrentWindow();
}
}
/**
* @brief 清空所有缓存数据
*/
void clearAllData() {
std::lock_guard<std::mutex> lock(data_mutex);
resetState();
std::queue<std::unique_ptr<SlidingWindow>> empty_queue;
std::swap(completed_windows, empty_queue);
estimated_memory_usage = 0;
}
private:
// 更新人声开始前的预缓存
void updatePreVoiceBuffer(const PCMFrame& frame) {
pre_voice_buffer.push_back(frame);
// 保持预缓存大小不超过配置值
if (pre_voice_buffer.size() > config.pre_voice_frames) {
pre_voice_buffer.erase(pre_voice_buffer.begin());
}
}
// 处理非人声状态
void handleNonVoiceState(const PCMFrame& frame, size_t frame_memory) {
if (frame.vad_status) {
// 检测到人声开始
in_voice_segment = true;
consecutive_silence_frames = 0;
// 将预缓存数据加入当前窗口
for (const auto& pre_frame : pre_voice_buffer) {
current_window->frames.push_back(pre_frame);
current_window->total_frames++;
if (pre_frame.vad_status) {
current_window->voice_frames++;
}
}
// 添加当前帧
addFrameToCurrentWindow(frame);
}
// 非人声状态下,不进行其他处理
}
// 处理人声状态
void handleVoiceState(const PCMFrame& frame, size_t frame_memory) {
if (frame.vad_status) {
// 仍然是人声,重置静音计数
consecutive_silence_frames = 0;
} else {
// 静音帧
consecutive_silence_frames++;
}
// 添加当前帧到窗口
addFrameToCurrentWindow(frame);
// 检查是否需要结束当前语音段
if (consecutive_silence_frames >= config.silence_threshold_frames ||
current_window->frames.size() >= config.max_window_frames) {
// 添加人声结束后的保留帧
addPostVoiceFrames();
// 完成当前窗口
finalizeCurrentWindow();
}
}
// 添加帧到当前窗口
void addFrameToCurrentWindow(const PCMFrame& frame) {
current_window->frames.push_back(frame);
current_window->total_frames++;
if (frame.vad_status) {
current_window->voice_frames++;
}
current_window->calculateVoiceRatio();
}
// 添加人声结束后的保留帧
void addPostVoiceFrames() {
// 这个函数在实际实现中需要缓存后续的帧
// 简化实现:在当前设计中,我们依赖静音阈值来自然包含结束后的帧
}
// 完成当前窗口的处理
void finalizeCurrentWindow() {
// 计算最终的人声占比
current_window->calculateVoiceRatio();
// 检查人声占比是否达到阈值
if (current_window->voice_ratio >= config.voice_ratio_threshold) {
// 窗口有效,加入队列
if (completed_windows.size() >= config.max_queue_size) {
// 队列已满,移除最旧的数据
auto old_window = std::move(completed_windows.front());
completed_windows.pop();
estimated_memory_usage -= estimateWindowMemory(*old_window);
}
completed_windows.push(std::move(current_window));
data_condition.notify_one(); // 通知等待的消费者
}
// 重置状态,开始新的窗口
resetState();
}
};