feat: 实现基于 VAD 的流式 WAV 录制识别方案
- 新增 StreamingAudioWriter 组件:持续录制 WAV 文件,VAD 检测静音段自动切换
- 静音检测:检测到 ~1s 连续静音后关闭当前文件,触发 chunkCompleted 信号
- STTTestPage 重构:移除缓冲区推理模式,改为 WAV 文件流式识别
- 每个 WAV 文件完成后在后台线程读取并推理,不阻塞继续录制
- 设置页面新增「调试音频目录」配置项
- 音频存储路径:debug 模式使用配置目录,非 debug 模式使用系统临时目录

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
dc4ebab47c
commit
01a39ddc8c
@ -67,6 +67,7 @@ set(SOURCES
|
||||
src/audio/audio_capture.cpp
|
||||
src/audio/audio_decoder.cpp
|
||||
src/audio/audio_ring_buffer.cpp
|
||||
src/audio/streaming_audio_writer.cpp
|
||||
|
||||
# UI
|
||||
src/ui/main_window.cpp
|
||||
@ -102,6 +103,7 @@ set(HEADERS
|
||||
src/audio/audio_capture.h
|
||||
src/audio/audio_decoder.h
|
||||
src/audio/audio_ring_buffer.h
|
||||
src/audio/streaming_audio_writer.h
|
||||
|
||||
src/ui/main_window.h
|
||||
src/ui/stt_test_page.h
|
||||
|
||||
@ -83,7 +83,8 @@ void ConfigManager::loadDefaults() {
|
||||
{"input_device", -1},
|
||||
{"buffer_size_ms", 20},
|
||||
{"chunk_duration_ms", 3000},
|
||||
{"padding_ms", 500}
|
||||
{"padding_ms", 500},
|
||||
{"debug_dir", ""}
|
||||
}},
|
||||
{"ui", QVariantMap{
|
||||
{"theme", "light"},
|
||||
|
||||
253
src/audio/streaming_audio_writer.cpp
Normal file
253
src/audio/streaming_audio_writer.cpp
Normal file
@ -0,0 +1,253 @@
|
||||
#include "streaming_audio_writer.h"
|
||||
#include "core/vad.h"
|
||||
#include "utils/logger.h"
|
||||
|
||||
#include <QDateTime>
|
||||
#include <QStandardPaths>
|
||||
#include <QDir>
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
static const char* const kTag = "StreamingAudioWriter";
|
||||
|
||||
namespace impress {
|
||||
|
||||
/// Creates an idle writer. No file is opened and no VAD is allocated here;
/// all per-session state is set up by start().
/// @param parent Optional QObject parent, forwarded to the QObject base class.
StreamingAudioWriter::StreamingAudioWriter(QObject* parent)
    : QObject(parent) {}
|
||||
|
||||
/// Ends any recording still in progress by delegating to stop(), which
/// finalizes and closes the current WAV file.
StreamingAudioWriter::~StreamingAudioWriter()
{
    stop();
}
|
||||
|
||||
/**
 * @brief Resolve the directory in which WAV chunks are stored.
 *
 * @param debugEnabled Whether debug audio saving is switched on.
 * @param debugDir     Directory to use in debug mode; may be empty.
 * @return - debug mode with a non-empty @p debugDir: @p debugDir unchanged;
 *         - debug mode with an empty @p debugDir: "<temp>/impress_audio_debug";
 *         - otherwise "." when PLATFORM_WINDOWS is defined, else QDir::tempPath().
 */
QString StreamingAudioWriter::getAudioStorageDir(bool debugEnabled, const QString& debugDir) {
    if (debugEnabled) {
        if (debugDir.isEmpty()) {
            // No directory configured: fall back to a sub-directory of the temp path.
            return QDir::tempPath() + "/impress_audio_debug";
        }
        return debugDir;
    }

#ifdef PLATFORM_WINDOWS
    return ".";
#else
    return QDir::tempPath();
#endif
}
|
||||
|
||||
bool StreamingAudioWriter::start(int sampleRate, bool debugEnabled, const QString& debugDir) {
|
||||
QMutexLocker locker(&mutex_);
|
||||
|
||||
if (recording_) {
|
||||
LOG_WARNING(kTag, "已在录制中");
|
||||
return false;
|
||||
}
|
||||
|
||||
sampleRate_ = sampleRate;
|
||||
debugEnabled_ = debugEnabled;
|
||||
debugDir_ = debugDir;
|
||||
totalSamples_ = 0;
|
||||
samplesWritten_ = 0;
|
||||
wasSpeaking_ = false;
|
||||
silenceFramesAfterSpeech_ = 0;
|
||||
|
||||
// 初始化 VAD(30ms 帧,低能量阈值)
|
||||
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.015f, 3);
|
||||
|
||||
// VAD 帧大小
|
||||
vadFrameSize_ = sampleRate_ * 30 / 1000;
|
||||
if (vadFrameSize_ < 320) vadFrameSize_ = 320;
|
||||
|
||||
// 静音切换:~1s 的连续静音帧
|
||||
silenceFramesNeeded_ = 1000 / 30; // ~33 帧
|
||||
|
||||
// 确保目录存在
|
||||
QString dir = getAudioStorageDir(debugEnabled, debugDir);
|
||||
QDir d;
|
||||
if (!d.exists(dir)) {
|
||||
if (!d.mkpath(dir)) {
|
||||
LOG_ERROR(kTag, QString("无法创建音频存储目录: %1").arg(dir));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!openNewFile()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
recording_ = true;
|
||||
LOG_INFO(kTag, QString("流式录制已启动 (采样率: %1, VAD帧: %2, 静音切换: %3帧, 存储: %4)")
|
||||
.arg(sampleRate_).arg(vadFrameSize_).arg(silenceFramesNeeded_).arg(dir));
|
||||
return true;
|
||||
}
|
||||
|
||||
void StreamingAudioWriter::writeSamples(const std::vector<float>& samples) {
|
||||
QMutexLocker locker(&mutex_);
|
||||
if (!recording_ || !currentStream_) return;
|
||||
|
||||
if (samples.empty()) return;
|
||||
|
||||
// 1. 写入 WAV 文件 (float -> int16)
|
||||
for (float s : samples) {
|
||||
s = std::max(-1.0f, std::min(1.0f, s)); // clip
|
||||
int16_t val = static_cast<int16_t>(s * 32767.0f);
|
||||
*currentStream_ << val;
|
||||
samplesWritten_++;
|
||||
totalSamples_++;
|
||||
}
|
||||
|
||||
// 2. 用 VAD 检测语音活动
|
||||
bool isSpeaking = vad_->process(samples);
|
||||
|
||||
// 3. 静音段切换逻辑:
|
||||
// 检测到「说话 → 静音」的过渡,连续静音帧数达到阈值时切换
|
||||
if (isSpeaking) {
|
||||
silenceFramesAfterSpeech_ = 0;
|
||||
wasSpeaking_ = true;
|
||||
} else if (wasSpeaking_) {
|
||||
silenceFramesAfterSpeech_++;
|
||||
if (silenceFramesAfterSpeech_ >= silenceFramesNeeded_ && static_cast<int>(samplesWritten_) > sampleRate_ / 2) {
|
||||
// 至少有 0.5 秒音频才切换
|
||||
LOG_DEBUG(kTag, QString("检测到静音段 (连续 %1 帧, 能量: %2),切换 WAV 文件")
|
||||
.arg(silenceFramesAfterSpeech_)
|
||||
.arg(vad_->currentEnergy(), 0, 'f', 4));
|
||||
|
||||
// 完成当前文件
|
||||
finalizeWavFile();
|
||||
closeCurrentFile();
|
||||
|
||||
// 发射完成信号
|
||||
int durationMs = static_cast<int>(samplesWritten_ * 1000 / sampleRate_);
|
||||
QString completedPath = currentFilePath_;
|
||||
emit chunkCompleted(completedPath, durationMs);
|
||||
|
||||
// 打开新文件
|
||||
samplesWritten_ = 0;
|
||||
silenceFramesAfterSpeech_ = 0;
|
||||
wasSpeaking_ = false;
|
||||
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.015f, 3);
|
||||
|
||||
if (!openNewFile()) {
|
||||
LOG_ERROR(kTag, "无法打开新的 WAV 文件,停止录制");
|
||||
recording_ = false;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
// else: 还没开始说话,不计数
|
||||
}
|
||||
|
||||
void StreamingAudioWriter::stop() {
|
||||
QMutexLocker locker(&mutex_);
|
||||
if (!recording_) return;
|
||||
|
||||
if (samplesWritten_ > 0) {
|
||||
finalizeWavFile();
|
||||
// 停止时不触发 chunkCompleted,因为最后一小段可能太短
|
||||
// 如果需要处理最后一段,可以在外部调用时手动处理
|
||||
}
|
||||
closeCurrentFile();
|
||||
|
||||
recording_ = false;
|
||||
LOG_INFO(kTag, QString("流式录制已停止 (总计: %1 样本, 约 %2 秒)")
|
||||
.arg(totalSamples_).arg(totalSamples_ * 1000.0 / sampleRate_ / 1000.0, 0, 'f', 1));
|
||||
}
|
||||
|
||||
/// @brief Path of the WAV file currently being written.
/// @return A copy of the path, taken while holding the mutex; empty when no
///         file is open (closeCurrentFile() clears it).
QString StreamingAudioWriter::currentFilePath() const {
    QMutexLocker locker(&mutex_);
    const QString pathCopy = currentFilePath_;
    return pathCopy;
}
|
||||
|
||||
int StreamingAudioWriter::recordedDurationMs() const {
|
||||
QMutexLocker locker(&mutex_);
|
||||
return static_cast<int>(totalSamples_ * 1000 / sampleRate_);
|
||||
}
|
||||
|
||||
bool StreamingAudioWriter::openNewFile() {
|
||||
// 生成文件名
|
||||
QString dir = getAudioStorageDir(debugEnabled_, debugDir_);
|
||||
QString timestamp = QDateTime::currentDateTime().toString("yyyyMMdd_HHmmss_zzz");
|
||||
currentFilePath_ = QString("%1/record_%2.wav").arg(dir).arg(timestamp);
|
||||
|
||||
currentFile_ = new QFile(currentFilePath_);
|
||||
if (!currentFile_->open(QIODevice::WriteOnly)) {
|
||||
LOG_ERROR(kTag, QString("无法创建 WAV 文件: %1").arg(currentFilePath_));
|
||||
delete currentFile_;
|
||||
currentFile_ = nullptr;
|
||||
return false;
|
||||
}
|
||||
|
||||
currentStream_ = new QDataStream(currentFile_);
|
||||
currentStream_->setByteOrder(QDataStream::LittleEndian);
|
||||
|
||||
// 初始化并写入 WAV 头
|
||||
WavHeader header{};
|
||||
memcpy(header.riff, "RIFF", 4);
|
||||
memcpy(header.wave, "WAVE", 4);
|
||||
memcpy(header.fmt, "fmt ", 4);
|
||||
header.fmtSize = 16;
|
||||
header.audioFormat = 1; // PCM
|
||||
header.numChannels = 1; // mono
|
||||
header.sampleRate = static_cast<uint32_t>(sampleRate_);
|
||||
header.byteRate = static_cast<uint32_t>(sampleRate_) * 2;
|
||||
header.blockAlign = 2;
|
||||
header.bitsPerSample = 16;
|
||||
memcpy(header.data, "data", 4);
|
||||
header.dataSize = 0;
|
||||
header.fileSize = sizeof(WavHeader) - 8;
|
||||
|
||||
// 写入头
|
||||
currentStream_->writeRawData(header.riff, 4);
|
||||
*currentStream_ << header.fileSize;
|
||||
currentStream_->writeRawData(header.wave, 4);
|
||||
currentStream_->writeRawData(header.fmt, 4);
|
||||
*currentStream_ << header.fmtSize;
|
||||
*currentStream_ << header.audioFormat;
|
||||
*currentStream_ << header.numChannels;
|
||||
*currentStream_ << header.sampleRate;
|
||||
*currentStream_ << header.byteRate;
|
||||
*currentStream_ << header.blockAlign;
|
||||
*currentStream_ << header.bitsPerSample;
|
||||
currentStream_->writeRawData(header.data, 4);
|
||||
*currentStream_ << header.dataSize;
|
||||
|
||||
LOG_DEBUG(kTag, QString("新 WAV 文件已打开: %1").arg(currentFilePath_));
|
||||
return true;
|
||||
}
|
||||
|
||||
void StreamingAudioWriter::closeCurrentFile() {
|
||||
if (currentStream_) {
|
||||
delete currentStream_;
|
||||
currentStream_ = nullptr;
|
||||
}
|
||||
if (currentFile_) {
|
||||
currentFile_->close();
|
||||
delete currentFile_;
|
||||
currentFile_ = nullptr;
|
||||
}
|
||||
currentFilePath_.clear();
|
||||
}
|
||||
|
||||
void StreamingAudioWriter::finalizeWavFile() {
|
||||
if (!currentFile_ || !currentFile_->isOpen()) return;
|
||||
|
||||
// 计算实际大小
|
||||
uint32_t dataBytes = samplesWritten_ * 2; // 16-bit mono
|
||||
uint32_t fileSize = sizeof(WavHeader) + dataBytes - 8;
|
||||
|
||||
// 回写到文件头更新大小
|
||||
currentFile_->seek(4);
|
||||
currentStream_->writeRawData(reinterpret_cast<const char*>(&fileSize), 4);
|
||||
currentFile_->seek(sizeof(WavHeader) - 4); // dataSize 偏移
|
||||
currentStream_->writeRawData(reinterpret_cast<const char*>(&dataBytes), 4);
|
||||
currentFile_->flush();
|
||||
|
||||
int durationMs = static_cast<int>(samplesWritten_ * 1000 / sampleRate_);
|
||||
LOG_DEBUG(kTag, QString("WAV 文件已保存: %1 (时长: %2ms, 样本: %3)")
|
||||
.arg(currentFilePath_).arg(durationMs).arg(samplesWritten_));
|
||||
}
|
||||
|
||||
} // namespace impress
|
||||
140
src/audio/streaming_audio_writer.h
Normal file
140
src/audio/streaming_audio_writer.h
Normal file
@ -0,0 +1,140 @@
|
||||
#pragma once
|
||||
|
||||
#include <QObject>
|
||||
#include <QString>
|
||||
#include <QFile>
|
||||
#include <QDataStream>
|
||||
#include <QMutex>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace impress {
|
||||
|
||||
class VoiceActivityDetector;
|
||||
|
||||
/**
|
||||
* @brief 流式音频录制器
|
||||
*
|
||||
* 将连续音频数据写入 WAV 文件,通过 VAD 检测静音段自动切换文件。
|
||||
* 完成一个 WAV 文件后,通过 signal 输出文件路径供外部识别。
|
||||
*
|
||||
* 工作流程:
|
||||
* 1. 音频数据持续写入当前 WAV 文件
|
||||
* 2. VAD 实时检测语音活动
|
||||
* 3. 检测到 ~1s 静音后,关闭当前文件、发射 chunkCompleted 信号、打开新文件
|
||||
* 4. 外部收到信号后,在后台线程对 WAV 文件进行识别
|
||||
*
|
||||
* 音频存储路径:
|
||||
* - debug_save_audio 开启 → 使用配置的 audio_debug_dir
|
||||
* - debug_save_audio 关闭 → Windows: 当前目录, Linux/Mac: 系统临时目录
|
||||
*/
|
||||
class StreamingAudioWriter : public QObject {
|
||||
Q_OBJECT
|
||||
public:
|
||||
explicit StreamingAudioWriter(QObject* parent = nullptr);
|
||||
~StreamingAudioWriter() override;
|
||||
|
||||
/**
|
||||
* @brief 开始录制(打开第一个 WAV 文件)
|
||||
* @param sampleRate 采样率 (如 16000)
|
||||
* @param debugEnabled 是否开启调试模式(保存到配置路径)
|
||||
* @param debugDir 调试目录(debugEnabled=true 时使用,为空则使用默认值)
|
||||
*/
|
||||
bool start(int sampleRate, bool debugEnabled = false, const QString& debugDir = QString());
|
||||
|
||||
/**
|
||||
* @brief 写入音频样本(归一化 PCM float,范围 -1.0 ~ 1.0)
|
||||
*
|
||||
* 此方法会:
|
||||
* 1. 写入当前 WAV 文件
|
||||
* 2. 通过 VAD 检测语音活动
|
||||
* 3. 检测到静音段时自动切换文件并触发 chunkCompleted 信号
|
||||
*/
|
||||
void writeSamples(const std::vector<float>& samples);
|
||||
|
||||
/**
|
||||
* @brief 停止录制,关闭当前文件(不触发 chunkCompleted)
|
||||
*/
|
||||
void stop();
|
||||
|
||||
/** @brief 是否正在录制 */
|
||||
bool isRecording() const { return recording_; }
|
||||
|
||||
/** @brief 当前 WAV 文件路径 */
|
||||
QString currentFilePath() const;
|
||||
|
||||
/** @brief 当前文件已写入的样本数 */
|
||||
int currentSampleCount() const { return samplesWritten_; }
|
||||
|
||||
/** @brief 已录制音频总时长(毫秒) */
|
||||
int recordedDurationMs() const;
|
||||
|
||||
/**
|
||||
* @brief 获取音频存储目录(根据 debug 状态自动选择)
|
||||
*/
|
||||
static QString getAudioStorageDir(bool debugEnabled, const QString& debugDir = QString());
|
||||
|
||||
signals:
|
||||
/**
|
||||
* @brief 一个 WAV 文件录制完成(检测到静音段切换)
|
||||
* @param filePath WAV 文件的完整路径
|
||||
* @param durationMs 音频时长(毫秒)
|
||||
*/
|
||||
void chunkCompleted(const QString& filePath, int durationMs);
|
||||
|
||||
/** @brief 录制错误 */
|
||||
void error(const QString& message);
|
||||
|
||||
private:
|
||||
/** 打开新的 WAV 文件 */
|
||||
bool openNewFile();
|
||||
|
||||
/** 关闭当前 WAV 文件(不更新文件头) */
|
||||
void closeCurrentFile();
|
||||
|
||||
/** 更新 WAV 文件头的 data chunk 大小和 RIFF 大小 */
|
||||
void finalizeWavFile();
|
||||
|
||||
// WAV 文件头结构 (44 字节)
|
||||
struct WavHeader {
|
||||
char riff[4]; // "RIFF"
|
||||
uint32_t fileSize; // 文件总大小 - 8
|
||||
char wave[4]; // "WAVE"
|
||||
char fmt[4]; // "fmt "
|
||||
uint32_t fmtSize; // fmt chunk 大小 (16)
|
||||
uint16_t audioFormat; // 音频格式 (1 = PCM)
|
||||
uint16_t numChannels; // 通道数 (1 = mono)
|
||||
uint32_t sampleRate; // 采样率
|
||||
uint32_t byteRate; // 字节率
|
||||
uint16_t blockAlign; // 块对齐
|
||||
uint16_t bitsPerSample;// 位深度 (16)
|
||||
char data[4]; // "data"
|
||||
uint32_t dataSize; // data chunk 大小
|
||||
};
|
||||
|
||||
int sampleRate_ = 16000;
|
||||
bool recording_ = false;
|
||||
bool debugEnabled_ = false;
|
||||
QString debugDir_;
|
||||
|
||||
// VAD
|
||||
std::unique_ptr<VoiceActivityDetector> vad_;
|
||||
bool wasSpeaking_ = false; // 上一帧是否在说话
|
||||
int silenceFramesAfterSpeech_ = 0; // 说话后连续静音帧数
|
||||
int silenceFramesNeeded_ = 4; // 需要多少帧静音才切换(~1s)
|
||||
|
||||
// 当前文件
|
||||
QString currentFilePath_;
|
||||
QFile* currentFile_ = nullptr;
|
||||
QDataStream* currentStream_ = nullptr;
|
||||
uint32_t samplesWritten_ = 0;
|
||||
int64_t totalSamples_ = 0;
|
||||
|
||||
// 能量计算帧大小(VAD 帧大小)
|
||||
int vadFrameSize_ = 480; // 16000 * 30ms = 480
|
||||
|
||||
mutable QMutex mutex_;
|
||||
};
|
||||
|
||||
} // namespace impress
|
||||
@ -120,6 +120,16 @@ void SettingsPage::setupUI() {
|
||||
populateAudioDevices();
|
||||
audioLayout->addRow("输入设备:", audioDeviceCombo_);
|
||||
|
||||
// 音频调试目录
|
||||
auto* debugDirRow = new QHBoxLayout();
|
||||
audioDebugDirEdit_ = new QLineEdit(this);
|
||||
audioDebugDirEdit_->setPlaceholderText("流式识别 WAV 文件保存路径(为空时使用系统临时目录)");
|
||||
audioDebugDirBtn_ = new QPushButton("浏览...", this);
|
||||
connect(audioDebugDirBtn_, &QPushButton::clicked, this, &SettingsPage::onBrowseAudioDebugDir);
|
||||
debugDirRow->addWidget(audioDebugDirEdit_);
|
||||
debugDirRow->addWidget(audioDebugDirBtn_);
|
||||
audioLayout->addRow("调试音频目录:", debugDirRow);
|
||||
|
||||
bufferSizeSpin_ = new QSpinBox(this);
|
||||
bufferSizeSpin_->setRange(10, 100);
|
||||
bufferSizeSpin_->setValue(20);
|
||||
@ -206,6 +216,7 @@ void SettingsPage::loadFromConfig() {
|
||||
// 恢复音频设备选择
|
||||
int savedDevice = configManager_->get("audio.input_device").toInt();
|
||||
selectAudioDevice(savedDevice);
|
||||
audioDebugDirEdit_->setText(configManager_->get("audio.debug_dir").toString());
|
||||
|
||||
themeCombo_->setCurrentText(configManager_->get("ui.theme").toString());
|
||||
fontSizeSpin_->setValue(configManager_->get("ui.font_size").toInt());
|
||||
@ -229,6 +240,7 @@ void SettingsPage::saveToConfig() {
|
||||
batch["stt.temperature"] = temperatureSpin_->value();
|
||||
batch["shortcuts.voice_hotkey"] = hotkeyRecorder_->hotkeyText();
|
||||
batch["audio.input_device"] = getSelectedAudioDeviceIndex();
|
||||
batch["audio.debug_dir"] = audioDebugDirEdit_->text();
|
||||
batch["audio.buffer_size_ms"] = bufferSizeSpin_->value();
|
||||
batch["audio.chunk_duration_ms"] = chunkDurationSpin_->value();
|
||||
batch["audio.padding_ms"] = paddingSpin_->value();
|
||||
@ -293,6 +305,14 @@ void SettingsPage::onBrowseTokensPath() {
|
||||
}
|
||||
}
|
||||
|
||||
void SettingsPage::onBrowseAudioDebugDir() {
|
||||
QString path = QFileDialog::getExistingDirectory(this, "选择调试音频目录", "",
|
||||
QFileDialog::ShowDirsOnly);
|
||||
if (!path.isEmpty()) {
|
||||
audioDebugDirEdit_->setText(path);
|
||||
}
|
||||
}
|
||||
|
||||
void SettingsPage::onSaveConfig() {
|
||||
saveToConfig();
|
||||
if (configManager_->save()) {
|
||||
|
||||
@ -31,6 +31,7 @@ public:
|
||||
private slots:
|
||||
void onBrowseModelPath();
|
||||
void onBrowseTokensPath();
|
||||
void onBrowseAudioDebugDir();
|
||||
void onSaveConfig();
|
||||
void onResetConfig();
|
||||
|
||||
@ -62,6 +63,8 @@ private:
|
||||
|
||||
// 音频设置
|
||||
QComboBox* audioDeviceCombo_;
|
||||
QLineEdit* audioDebugDirEdit_;
|
||||
QPushButton* audioDebugDirBtn_;
|
||||
QSpinBox* bufferSizeSpin_;
|
||||
QSpinBox* chunkDurationSpin_;
|
||||
QSpinBox* paddingSpin_;
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#include "stt_test_page.h"
|
||||
#include "core/sense_voice_engine.h"
|
||||
#include "audio/audio_capture.h"
|
||||
#include "audio/audio_ring_buffer.h"
|
||||
#include "audio/streaming_audio_writer.h"
|
||||
#include "widgets/audio_waveform.h"
|
||||
#include "app/config_manager.h"
|
||||
#include "utils/logger.h"
|
||||
@ -18,7 +18,6 @@
|
||||
#include <QMessageBox>
|
||||
#include <QDateTime>
|
||||
#include <QFileInfo>
|
||||
#include <QTimer>
|
||||
#include <QtConcurrent>
|
||||
|
||||
static const char* const kTag = "STTTestPage";
|
||||
@ -32,23 +31,21 @@ STTTestPage::STTTestPage(ConfigManager* configManager,
|
||||
, configManager_(configManager)
|
||||
, sttEngine_(sttEngine)
|
||||
, audioCapture_(new AudioCapture(this))
|
||||
, inferenceTimer_(new QTimer(this))
|
||||
, streamingWriter_(new StreamingAudioWriter(this))
|
||||
{
|
||||
setupUI();
|
||||
|
||||
// 信号连接
|
||||
connect(audioCapture_, &AudioCapture::audioDataReady,
|
||||
this, &STTTestPage::onAudioDataReady);
|
||||
connect(streamingWriter_, &StreamingAudioWriter::chunkCompleted,
|
||||
this, &STTTestPage::onChunkCompleted);
|
||||
connect(sttEngine_, &SenseVoiceEngine::modelLoaded,
|
||||
this, &STTTestPage::onModelLoaded);
|
||||
connect(sttEngine_, &SenseVoiceEngine::modelLoadError,
|
||||
this, &STTTestPage::onModelLoadError);
|
||||
connect(sttEngine_, &SenseVoiceEngine::modelUnloaded,
|
||||
this, &STTTestPage::onModelUnloaded);
|
||||
|
||||
// 推理定时器:周期性触发后台推理
|
||||
connect(inferenceTimer_, &QTimer::timeout,
|
||||
this, &STTTestPage::onInferenceTimer);
|
||||
}
|
||||
|
||||
STTTestPage::~STTTestPage() = default;
|
||||
@ -64,13 +61,6 @@ void STTTestPage::setupUI() {
|
||||
deviceCombo_->addItems(AudioCapture::getDeviceList());
|
||||
controlLayout->addRow("输入设备:", deviceCombo_);
|
||||
|
||||
chunkSizeSpin_ = new QSpinBox(this);
|
||||
chunkSizeSpin_->setRange(500, 10000);
|
||||
chunkSizeSpin_->setSingleStep(500);
|
||||
chunkSizeSpin_->setValue(3000);
|
||||
chunkSizeSpin_->setSuffix(" ms");
|
||||
controlLayout->addRow("推理间隔:", chunkSizeSpin_);
|
||||
|
||||
auto* btnLayout = new QHBoxLayout();
|
||||
recordBtn_ = new QPushButton("开始录音", this);
|
||||
recordBtn_->setMinimumWidth(120);
|
||||
@ -117,16 +107,14 @@ void STTTestPage::updateUIState() {
|
||||
? "QPushButton { font-weight: bold; padding: 8px 16px; background-color: #e74c3c; color: white; }"
|
||||
: "QPushButton { font-weight: bold; padding: 8px 16px; }");
|
||||
deviceCombo_->setEnabled(!isRecording_ && !isLoadingModel_);
|
||||
chunkSizeSpin_->setEnabled(!isRecording_ && !isLoadingModel_);
|
||||
}
|
||||
|
||||
void STTTestPage::onToggleRecording() {
|
||||
if (isRecording_) {
|
||||
streamingWriter_->stop();
|
||||
audioCapture_->stop();
|
||||
inferenceTimer_->stop();
|
||||
isRecording_ = false;
|
||||
isInferencing_ = false;
|
||||
audioBuffer_.clear();
|
||||
} else {
|
||||
// 检查全局模型是否已加载
|
||||
if (!sttEngine_->isLoaded()) {
|
||||
@ -172,81 +160,104 @@ void STTTestPage::onModelUnloaded() {
|
||||
void STTTestPage::startAudioCapture() {
|
||||
int deviceIdx = deviceCombo_->currentIndex() - 1;
|
||||
audioSampleRate_ = configManager_->get("stt.sample_rate").toInt();
|
||||
bool debugEnabled = configManager_->get("stt.debug_save_audio").toBool();
|
||||
|
||||
// 启动流式录制器
|
||||
if (!streamingWriter_->start(audioSampleRate_, debugEnabled)) {
|
||||
QMessageBox::critical(this, "错误", "无法启动流式录制器");
|
||||
return;
|
||||
}
|
||||
|
||||
// 启动音频采集
|
||||
if (!audioCapture_->start(deviceIdx, audioSampleRate_)) {
|
||||
streamingWriter_->stop();
|
||||
QMessageBox::critical(this, "错误", "无法启动音频采集");
|
||||
return;
|
||||
}
|
||||
|
||||
isRecording_ = true;
|
||||
audioBuffer_.clear();
|
||||
isInferencing_ = false;
|
||||
completedCount_ = 0;
|
||||
|
||||
// 启动周期性推理定时器
|
||||
startInferenceTimer();
|
||||
|
||||
statusLabel_->setText("录音中 | 模型已加载");
|
||||
statusLabel_->setText("录音中 | VAD 流式识别");
|
||||
updateUIState();
|
||||
}
|
||||
|
||||
void STTTestPage::startInferenceTimer() {
|
||||
int interval = chunkSizeSpin_->value(); // 与推理间隔同步
|
||||
inferenceTimer_->start(interval);
|
||||
}
|
||||
|
||||
void STTTestPage::onAudioDataReady(const std::vector<float>& samples, int /* sampleRate */) {
|
||||
// 仅缓存音频数据,不直接调用推理
|
||||
// 避免推理阻塞音频采集线程
|
||||
audioBuffer_.insert(audioBuffer_.end(), samples.begin(), samples.end());
|
||||
// 写入流式录制器(WAV 文件 + VAD 静音检测)
|
||||
streamingWriter_->writeSamples(samples);
|
||||
|
||||
// 更新波形显示(使用最新数据片段)
|
||||
// 更新波形显示
|
||||
waveform_->setSamples(samples);
|
||||
}
|
||||
|
||||
void STTTestPage::onInferenceTimer() {
|
||||
if (!sttEngine_->isLoaded() || isInferencing_) {
|
||||
void STTTestPage::onChunkCompleted(const QString& filePath, int durationMs) {
|
||||
completedCount_++;
|
||||
LOG_INFO(kTag, QString("WAV 片段 #%1 已完成: %2 (%3ms)")
|
||||
.arg(completedCount_).arg(filePath).arg(durationMs));
|
||||
|
||||
statusLabel_->setText(QString("正在识别 #%1 (%2ms)...").arg(completedCount_).arg(durationMs));
|
||||
|
||||
// 在后台线程对 WAV 文件进行识别
|
||||
transcribeChunk(filePath, durationMs);
|
||||
}
|
||||
|
||||
void STTTestPage::transcribeChunk(const QString& filePath, int /* durationMs */) {
|
||||
if (isInferencing_) {
|
||||
// 上一个识别还没完成,跳过(避免堆积)
|
||||
LOG_WARNING(kTag, "上一个识别仍在进行中,跳过当前片段");
|
||||
return;
|
||||
}
|
||||
|
||||
int chunkSize = audioSampleRate_ * chunkSizeSpin_->value() / 1000;
|
||||
|
||||
if (static_cast<int>(audioBuffer_.size()) < chunkSize) {
|
||||
return; // 缓冲区数据不足,等待下一次
|
||||
}
|
||||
|
||||
// 提取一个推理块的音频
|
||||
std::vector<float> chunk(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize);
|
||||
audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize);
|
||||
|
||||
// 在后台线程执行推理
|
||||
isInferencing_ = true;
|
||||
statusLabel_->setText("推理中...");
|
||||
|
||||
int sampleRate = audioSampleRate_;
|
||||
QString language = configManager_->get("stt.language").toString();
|
||||
// 在后台线程读取 WAV 文件并推理
|
||||
(void)QtConcurrent::run([this, filePath]() {
|
||||
QString text;
|
||||
QString errorMsg;
|
||||
|
||||
(void)QtConcurrent::run([this, chunk, sampleRate, language]() {
|
||||
auto result = sttEngine_->infer(chunk, sampleRate, language);
|
||||
// 读取 WAV 文件为 float 样本
|
||||
QFile file(filePath);
|
||||
if (!file.open(QIODevice::ReadOnly)) {
|
||||
errorMsg = QString("无法打开 WAV 文件: %1").arg(filePath);
|
||||
} else {
|
||||
// 跳过 44 字节 WAV 头
|
||||
file.seek(44);
|
||||
QByteArray raw = file.readAll();
|
||||
file.close();
|
||||
|
||||
// 回到主线程更新 UI
|
||||
QMetaObject::invokeMethod(this, [this, result]() {
|
||||
isInferencing_ = false;
|
||||
|
||||
if (result.text.isEmpty() && !result.text.isNull()) {
|
||||
// 静音段
|
||||
latencyLabel_->setText(QString("延迟: %1 ms").arg(result.latency_ms, 0, 'f', 1));
|
||||
} else {
|
||||
emit onRecognitionResult(result.text, result.confidence,
|
||||
result.latency_ms, result.isFinal);
|
||||
// int16 -> float
|
||||
int numSamples = raw.size() / 2;
|
||||
std::vector<float> samples(numSamples);
|
||||
for (int i = 0; i < numSamples; i++) {
|
||||
int16_t val = *reinterpret_cast<const int16_t*>(raw.data() + i * 2);
|
||||
samples[i] = static_cast<float>(val) / 32767.0f;
|
||||
}
|
||||
|
||||
if (!sttEngine_->isLoaded()) {
|
||||
text = "[错误] 模型未加载";
|
||||
} else {
|
||||
QString language = configManager_->get("stt.language").toString();
|
||||
auto result = sttEngine_->infer(samples, audioSampleRate_, language);
|
||||
text = result.text;
|
||||
errorMsg = result.text.startsWith("[错误]") ? result.text : QString();
|
||||
}
|
||||
}
|
||||
|
||||
// 回到主线程更新 UI
|
||||
QMetaObject::invokeMethod(this, [this, text, errorMsg, filePath]() {
|
||||
isInferencing_ = false;
|
||||
|
||||
if (!errorMsg.isEmpty() && text.startsWith("[错误]")) {
|
||||
statusLabel_->setText(text);
|
||||
} else if (text.isEmpty()) {
|
||||
statusLabel_->setText(QString("片段 #%1: 静音").arg(completedCount_));
|
||||
} else {
|
||||
emit onRecognitionResult(text, 1.0f, 0, true);
|
||||
}
|
||||
|
||||
// 更新状态
|
||||
if (isRecording_) {
|
||||
int bufMs = (audioSampleRate_ > 0)
|
||||
? static_cast<int>(audioBuffer_.size() * 1000 / audioSampleRate_)
|
||||
: 0;
|
||||
statusLabel_->setText(
|
||||
QString("录音中 | 缓冲区: %1 ms").arg(bufMs));
|
||||
statusLabel_->setText(QString("录音中 | 已识别 %1 个片段").arg(completedCount_));
|
||||
}
|
||||
}, Qt::QueuedConnection);
|
||||
});
|
||||
@ -256,8 +267,8 @@ void STTTestPage::onRecognitionResult(const QString& text, float confidence,
|
||||
double latency, bool isFinal)
|
||||
{
|
||||
QString timestamp = QDateTime::currentDateTime().toString("hh:mm:ss");
|
||||
QString line = QString("[%1] %2 (置信度: %3%, 延迟: %4 ms)\n")
|
||||
.arg(timestamp, text)
|
||||
QString line = QString("[%1] #%2 %3 (置信度: %4%, 延迟: %5 ms)")
|
||||
.arg(timestamp).arg(completedCount_).arg(text)
|
||||
.arg(confidence * 100, 0, 'f', 1)
|
||||
.arg(latency, 0, 'f', 1);
|
||||
|
||||
|
||||
@ -9,20 +9,20 @@ class QPushButton;
|
||||
class QComboBox;
|
||||
class QTextEdit;
|
||||
class QSpinBox;
|
||||
class QTimer;
|
||||
|
||||
namespace impress {
|
||||
|
||||
class ConfigManager;
|
||||
class SenseVoiceEngine;
|
||||
class AudioCapture;
|
||||
class StreamingAudioWriter;
|
||||
|
||||
/**
|
||||
* @brief STT 测试页面
|
||||
*
|
||||
* 实时麦克风采集 + 周期性后台推理。
|
||||
* 实时麦克风采集 + 基于 VAD 的流式 WAV 文件录制 + 后台识别。
|
||||
* 音频采集与推理分离,防止推理阻塞音频流。
|
||||
* 使用 SenseVoice 模型进行推理。
|
||||
* 使用 VAD 检测静音段自动切换 WAV 文件,每个文件完成后触发识别。
|
||||
*/
|
||||
class STTTestPage : public QWidget {
|
||||
Q_OBJECT
|
||||
@ -35,22 +35,22 @@ public:
|
||||
private slots:
|
||||
void onToggleRecording();
|
||||
void onAudioDataReady(const std::vector<float>& samples, int sampleRate);
|
||||
void onChunkCompleted(const QString& filePath, int durationMs);
|
||||
void onRecognitionResult(const QString& text, float confidence, double latency, bool isFinal);
|
||||
void onModelLoaded(const QString& modelPath);
|
||||
void onModelLoadError(const QString& modelPath, const QString& error);
|
||||
void onModelUnloaded();
|
||||
void onInferenceTimer();
|
||||
|
||||
private:
|
||||
void setupUI();
|
||||
void updateUIState();
|
||||
void startAudioCapture();
|
||||
void startInferenceTimer();
|
||||
void transcribeChunk(const QString& filePath, int durationMs);
|
||||
|
||||
ConfigManager* configManager_;
|
||||
SenseVoiceEngine* sttEngine_;
|
||||
AudioCapture* audioCapture_;
|
||||
QTimer* inferenceTimer_;
|
||||
StreamingAudioWriter* streamingWriter_;
|
||||
|
||||
// UI 控件
|
||||
QComboBox* deviceCombo_;
|
||||
@ -59,13 +59,12 @@ private:
|
||||
QLabel* latencyLabel_;
|
||||
QLabel* statusLabel_;
|
||||
AudioWaveform* waveform_;
|
||||
QSpinBox* chunkSizeSpin_;
|
||||
|
||||
bool isRecording_ = false;
|
||||
bool isLoadingModel_ = false;
|
||||
bool isInferencing_ = false;
|
||||
int audioSampleRate_ = 16000;
|
||||
std::vector<float> audioBuffer_;
|
||||
int completedCount_ = 0; // 已完成文件计数
|
||||
};
|
||||
|
||||
} // namespace impress
|
||||
|
||||
Loading…
Reference in New Issue
Block a user