ba5e47bc77
1. 应用界面增加了返回主页的按钮 2. 修复了gif渲染内存泄漏的严重bug 3. 将PetDao当中的cJSON API替换为cpp_json,完美通过测试 4. 整合已经实现的各种上层建筑,实现了一个宠物对话基本业务应用,用于样品测试展示用 5. 重构了音频播放类,使其更modern,更加便于移植和拓展
378 lines
12 KiB
C++
378 lines
12 KiB
C++
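//
// Created by misaki on 2025/9/29.
//
/**
 * This module processes audio data, using sliding windows and a VAD algorithm
 * to filter out segments of human speech.
 * It relies on VAD (voice activity detection) results and manages ranges of
 * frames with sliding windows so that voiced audio can be extracted precisely.
 * @author Misaki
 * @date 2025/9/29
 */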
|
||
#pragma once
|
||
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <utility>
#include <vector>
|
||
|
||
// One PCM audio frame together with the VAD decision made for it.
struct PCMFrame {
    std::vector<int16_t> audio_data;  // PCM samples owned by this frame
    size_t data_length;               // number of samples stored in audio_data
    bool vad_status;                  // VAD result for this frame (true = voice)
    uint64_t timestamp;               // optional timestamp, intended for debugging

    // Creates an empty, non-voice frame.
    PCMFrame() : data_length(0), vad_status(false), timestamp(0) {}

    /**
     * @brief Builds a frame by copying `len` samples starting at `data`.
     * @param data Pointer to the first sample. When it is null the frame is
     *             created empty and data_length is set to 0, regardless of `len`.
     * @param len  Number of samples to copy.
     * @param vad  VAD decision for this frame.
     * @param ts   Optional timestamp (defaults to 0).
     */
    PCMFrame(const int16_t* data, const size_t len, const bool vad, const uint64_t ts = 0)
        : data_length(data != nullptr ? len : 0), vad_status(vad), timestamp(ts) {
        // Forming `data + len` from a null pointer with len > 0 is undefined
        // behaviour, so only copy when there is a real buffer to read from.
        if (data != nullptr && len > 0) {
            audio_data.assign(data, data + len);
        }
    }
};
|
||
|
||
// 滑动窗口结构
|
||
struct SlidingWindow {
|
||
std::vector<PCMFrame> frames; // 窗口内的所有帧
|
||
size_t total_frames; // 总帧数
|
||
size_t voice_frames; // 人声帧数
|
||
double voice_ratio; // 人声占比
|
||
bool is_active; // 窗口是否处于活跃状态
|
||
|
||
SlidingWindow() : total_frames(0), voice_frames(0), voice_ratio(0.0), is_active(false) {}
|
||
|
||
// 计算人声占比
|
||
void calculateVoiceRatio() {
|
||
if (total_frames > 0) {
|
||
voice_ratio = static_cast<double>(voice_frames) / total_frames;
|
||
} else {
|
||
voice_ratio = 0.0;
|
||
}
|
||
}
|
||
};
|
||
|
||
class AudioBufferManager {
|
||
private:
|
||
// 单例实例
|
||
static AudioBufferManager* instance;
|
||
static std::mutex instance_mutex;
|
||
|
||
// 配置参数
|
||
struct Config {
|
||
size_t max_window_frames; // 单个窗口最大帧数
|
||
size_t max_queue_size; // 队列最大长度
|
||
size_t silence_threshold_frames; // 静音阈值帧数
|
||
double voice_ratio_threshold; // 人声占比阈值
|
||
size_t pre_voice_frames; // 人声开始前预保留帧数
|
||
size_t post_voice_frames; // 人声结束后保留帧数
|
||
} config;
|
||
|
||
// 内部状态
|
||
std::unique_ptr<SlidingWindow> current_window;
|
||
std::queue<std::unique_ptr<SlidingWindow>> completed_windows;
|
||
std::vector<PCMFrame> pre_voice_buffer; // 人声开始前的预缓存
|
||
|
||
// 状态跟踪
|
||
std::atomic<bool> in_voice_segment;
|
||
size_t consecutive_silence_frames;
|
||
size_t current_frame_count;
|
||
|
||
// 线程同步
|
||
std::mutex data_mutex;
|
||
std::condition_variable data_condition;
|
||
|
||
// 内存使用跟踪
|
||
size_t estimated_memory_usage;
|
||
const size_t MAX_MEMORY_BYTES = 512 * 1024; // 512KB
|
||
|
||
private:
|
||
AudioBufferManager() {
|
||
initializeDefaultConfig();
|
||
resetState();
|
||
}
|
||
|
||
void initializeDefaultConfig() {
|
||
// 默认配置:基于16kHz采样率,每帧20ms(320样本)
|
||
config.max_window_frames = 500; // 10秒音频(500 * 20ms)
|
||
config.max_queue_size = 8; // 队列最多8个窗口
|
||
config.silence_threshold_frames = 15; // 300ms静音判定结束(15 * 20ms)
|
||
config.voice_ratio_threshold = 0.3; // 30%人声占比阈值
|
||
config.pre_voice_frames = 5; // 人声开始前保留100ms
|
||
config.post_voice_frames = 10; // 人声结束后保留200ms
|
||
|
||
estimated_memory_usage = 0;
|
||
}
|
||
|
||
void resetState() {
|
||
current_window = std::make_unique<SlidingWindow>();
|
||
in_voice_segment = false;
|
||
consecutive_silence_frames = 0;
|
||
current_frame_count = 0;
|
||
pre_voice_buffer.clear();
|
||
pre_voice_buffer.reserve(config.pre_voice_frames);
|
||
}
|
||
|
||
// 估算单个帧的内存使用
|
||
size_t estimateFrameMemory(const PCMFrame& frame) const {
|
||
return sizeof(PCMFrame) + (frame.audio_data.capacity() * sizeof(int16_t));
|
||
}
|
||
|
||
// 估算窗口内存使用
|
||
size_t estimateWindowMemory(const SlidingWindow& window) const {
|
||
size_t memory = sizeof(SlidingWindow);
|
||
for (const auto& frame : window.frames) {
|
||
memory += estimateFrameMemory(frame);
|
||
}
|
||
return memory;
|
||
}
|
||
|
||
// 检查内存限制
|
||
bool checkMemoryConstraints() const {
|
||
return estimated_memory_usage < MAX_MEMORY_BYTES;
|
||
}
|
||
|
||
public:
|
||
// 删除拷贝构造函数和赋值运算符
|
||
AudioBufferManager(const AudioBufferManager&) = delete;
|
||
AudioBufferManager& operator=(const AudioBufferManager&) = delete;
|
||
|
||
// 获取单例实例
|
||
static AudioBufferManager* getInstance() {
|
||
std::lock_guard<std::mutex> lock(instance_mutex);
|
||
if (!instance) {
|
||
instance = new AudioBufferManager();
|
||
}
|
||
return instance;
|
||
}
|
||
|
||
/**
|
||
* @brief 配置管理器参数
|
||
* @param max_window_frames 单个窗口最大帧数
|
||
* @param max_queue_size 队列最大长度
|
||
* @param silence_threshold 静音阈值帧数
|
||
* @param voice_ratio_threshold 人声占比阈值
|
||
* @param pre_voice_frames 人声开始前预保留帧数
|
||
* @param post_voice_frames 人声结束后保留帧数
|
||
*/
|
||
void configure(const size_t max_window_frames = 500,
|
||
const size_t max_queue_size = 8,
|
||
const size_t silence_threshold = 15,
|
||
const double voice_ratio_threshold = 0.2,
|
||
const size_t pre_voice_frames = 5,
|
||
const size_t post_voice_frames = 10) {
|
||
std::lock_guard<std::mutex> lock(data_mutex);
|
||
|
||
config.max_window_frames = max_window_frames;
|
||
config.max_queue_size = max_queue_size;
|
||
config.silence_threshold_frames = silence_threshold;
|
||
config.voice_ratio_threshold = voice_ratio_threshold;
|
||
config.pre_voice_frames = pre_voice_frames;
|
||
config.post_voice_frames = post_voice_frames;
|
||
|
||
// 重新初始化状态
|
||
resetState();
|
||
}
|
||
|
||
/**
|
||
* @brief 注入新的音频帧数据
|
||
* @param audio_data PCM音频数据指针
|
||
* @param data_length 数据长度(样本数)
|
||
* @param vad_status 当前帧的VAD状态
|
||
*/
|
||
void injectAudioFrame(const int16_t* audio_data, size_t data_length, bool vad_status) {
|
||
std::lock_guard<std::mutex> lock(data_mutex);
|
||
|
||
// 创建新帧
|
||
PCMFrame new_frame(audio_data, data_length, vad_status, current_frame_count++);
|
||
size_t frame_memory = estimateFrameMemory(new_frame);
|
||
|
||
// 检查内存限制
|
||
if (!checkMemoryConstraints()) {
|
||
// 内存不足,采取清理策略
|
||
if (!completed_windows.empty()) {
|
||
auto old_window = std::move(completed_windows.front());
|
||
completed_windows.pop();
|
||
estimated_memory_usage -= estimateWindowMemory(*old_window);
|
||
}
|
||
}
|
||
|
||
// 更新预缓存
|
||
updatePreVoiceBuffer(new_frame);
|
||
|
||
// 状态机处理
|
||
if (!in_voice_segment) {
|
||
handleNonVoiceState(new_frame, frame_memory);
|
||
} else {
|
||
handleVoiceState(new_frame, frame_memory);
|
||
}
|
||
|
||
estimated_memory_usage += frame_memory;
|
||
}
|
||
|
||
/**
|
||
* @brief 获取可用的音频窗口数据
|
||
* @param timeout_ms 超时时间(毫秒)
|
||
* @return 滑动窗口指针,如果没有可用数据则返回nullptr
|
||
*/
|
||
std::unique_ptr<SlidingWindow> getAudioWindow(int timeout_ms = 0) {
|
||
std::unique_lock<std::mutex> lock(data_mutex);
|
||
if (completed_windows.empty()) {
|
||
if (timeout_ms <= 0) {
|
||
return nullptr;
|
||
}
|
||
// 等待数据可用
|
||
if (data_condition.wait_for(lock,
|
||
std::chrono::milliseconds(timeout_ms)) == std::cv_status::timeout) {
|
||
return nullptr;
|
||
}
|
||
if (completed_windows.empty()) {
|
||
return nullptr;
|
||
}
|
||
}
|
||
auto window = std::move(completed_windows.front());
|
||
completed_windows.pop();
|
||
estimated_memory_usage -= estimateWindowMemory(*window);
|
||
return window;
|
||
}
|
||
|
||
/**
|
||
* @brief 检查是否有可用的音频数据
|
||
*/
|
||
bool hasAvailableData() {
|
||
std::lock_guard<std::mutex> lock(data_mutex);
|
||
return !completed_windows.empty();
|
||
}
|
||
|
||
/**
|
||
* @brief 获取当前队列大小
|
||
*/
|
||
size_t getQueueSize() {
|
||
std::lock_guard<std::mutex> lock(data_mutex);
|
||
return completed_windows.size();
|
||
}
|
||
|
||
/**
|
||
* @brief 获取估计的内存使用量
|
||
*/
|
||
size_t getEstimatedMemoryUsage() const {
|
||
return estimated_memory_usage;
|
||
}
|
||
|
||
/**
|
||
* @brief 强制结束当前语音段(如果有)
|
||
*/
|
||
void forceEndCurrentSegment() {
|
||
std::lock_guard<std::mutex> lock(data_mutex);
|
||
if (in_voice_segment && current_window->total_frames > 0) {
|
||
finalizeCurrentWindow();
|
||
}
|
||
}
|
||
|
||
/**
|
||
* @brief 清空所有缓存数据
|
||
*/
|
||
void clearAllData() {
|
||
std::lock_guard<std::mutex> lock(data_mutex);
|
||
resetState();
|
||
std::queue<std::unique_ptr<SlidingWindow>> empty_queue;
|
||
std::swap(completed_windows, empty_queue);
|
||
estimated_memory_usage = 0;
|
||
}
|
||
|
||
private:
|
||
// 更新人声开始前的预缓存
|
||
void updatePreVoiceBuffer(const PCMFrame& frame) {
|
||
pre_voice_buffer.push_back(frame);
|
||
|
||
// 保持预缓存大小不超过配置值
|
||
if (pre_voice_buffer.size() > config.pre_voice_frames) {
|
||
pre_voice_buffer.erase(pre_voice_buffer.begin());
|
||
}
|
||
}
|
||
|
||
// 处理非人声状态
|
||
void handleNonVoiceState(const PCMFrame& frame, size_t frame_memory) {
|
||
if (frame.vad_status) {
|
||
// 检测到人声开始
|
||
in_voice_segment = true;
|
||
consecutive_silence_frames = 0;
|
||
|
||
// 将预缓存数据加入当前窗口
|
||
for (const auto& pre_frame : pre_voice_buffer) {
|
||
current_window->frames.push_back(pre_frame);
|
||
current_window->total_frames++;
|
||
if (pre_frame.vad_status) {
|
||
current_window->voice_frames++;
|
||
}
|
||
}
|
||
|
||
// 添加当前帧
|
||
addFrameToCurrentWindow(frame);
|
||
}
|
||
// 非人声状态下,不进行其他处理
|
||
}
|
||
|
||
// 处理人声状态
|
||
void handleVoiceState(const PCMFrame& frame, size_t frame_memory) {
|
||
if (frame.vad_status) {
|
||
// 仍然是人声,重置静音计数
|
||
consecutive_silence_frames = 0;
|
||
} else {
|
||
// 静音帧
|
||
consecutive_silence_frames++;
|
||
}
|
||
|
||
// 添加当前帧到窗口
|
||
addFrameToCurrentWindow(frame);
|
||
|
||
// 检查是否需要结束当前语音段
|
||
if (consecutive_silence_frames >= config.silence_threshold_frames ||
|
||
current_window->frames.size() >= config.max_window_frames) {
|
||
|
||
// 添加人声结束后的保留帧
|
||
addPostVoiceFrames();
|
||
|
||
// 完成当前窗口
|
||
finalizeCurrentWindow();
|
||
}
|
||
}
|
||
|
||
// 添加帧到当前窗口
|
||
void addFrameToCurrentWindow(const PCMFrame& frame) {
|
||
current_window->frames.push_back(frame);
|
||
current_window->total_frames++;
|
||
if (frame.vad_status) {
|
||
current_window->voice_frames++;
|
||
}
|
||
current_window->calculateVoiceRatio();
|
||
}
|
||
|
||
// 添加人声结束后的保留帧
|
||
void addPostVoiceFrames() {
|
||
// 这个函数在实际实现中需要缓存后续的帧
|
||
// 简化实现:在当前设计中,我们依赖静音阈值来自然包含结束后的帧
|
||
}
|
||
|
||
// 完成当前窗口的处理
|
||
void finalizeCurrentWindow() {
|
||
// 计算最终的人声占比
|
||
current_window->calculateVoiceRatio();
|
||
|
||
// 检查人声占比是否达到阈值
|
||
if (current_window->voice_ratio >= config.voice_ratio_threshold) {
|
||
// 窗口有效,加入队列
|
||
if (completed_windows.size() >= config.max_queue_size) {
|
||
// 队列已满,移除最旧的数据
|
||
auto old_window = std::move(completed_windows.front());
|
||
completed_windows.pop();
|
||
estimated_memory_usage -= estimateWindowMemory(*old_window);
|
||
}
|
||
|
||
completed_windows.push(std::move(current_window));
|
||
data_condition.notify_one(); // 通知等待的消费者
|
||
}
|
||
|
||
// 重置状态,开始新的窗口
|
||
resetState();
|
||
}
|
||
};
|