From 01a39ddc8c6a9554c9946da07caa43ed37b4f2a1 Mon Sep 17 00:00:00 2001 From: impressionyang Date: Wed, 13 May 2026 15:33:03 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=AE=9E=E7=8E=B0=E5=9F=BA=E4=BA=8E=20?= =?UTF-8?q?VAD=20=E7=9A=84=E6=B5=81=E5=BC=8F=20WAV=20=E5=BD=95=E5=88=B6?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E6=96=B9=E6=A1=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 StreamingAudioWriter 组件:持续录制 WAV 文件,VAD 检测静音段自动切换 - 静音检测:检测到 ~1s 连续静音后关闭当前文件,触发 chunkCompleted 信号 - STTTestPage 重构:移除缓冲区推理模式,改为 WAV 文件流式识别 - 每个 WAV 文件完成后在后台线程读取并推理,不阻塞继续录制 - 设置页面新增「调试音频目录」配置项 - 音频存储路径:debug 模式使用配置目录,非 debug 模式使用系统临时目录 Co-Authored-By: Claude Opus 4.6 --- CMakeLists.txt | 2 + src/app/config_manager.cpp | 3 +- src/audio/streaming_audio_writer.cpp | 253 +++++++++++++++++++++++++++ src/audio/streaming_audio_writer.h | 140 +++++++++++++++ src/ui/settings_page.cpp | 20 +++ src/ui/settings_page.h | 3 + src/ui/stt_test_page.cpp | 145 ++++++++------- src/ui/stt_test_page.h | 15 +- 8 files changed, 505 insertions(+), 76 deletions(-) create mode 100644 src/audio/streaming_audio_writer.cpp create mode 100644 src/audio/streaming_audio_writer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2362600..6745c89 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ set(SOURCES src/audio/audio_capture.cpp src/audio/audio_decoder.cpp src/audio/audio_ring_buffer.cpp + src/audio/streaming_audio_writer.cpp # UI src/ui/main_window.cpp @@ -102,6 +103,7 @@ set(HEADERS src/audio/audio_capture.h src/audio/audio_decoder.h src/audio/audio_ring_buffer.h + src/audio/streaming_audio_writer.h src/ui/main_window.h src/ui/stt_test_page.h diff --git a/src/app/config_manager.cpp b/src/app/config_manager.cpp index 5781285..fe0af35 100644 --- a/src/app/config_manager.cpp +++ b/src/app/config_manager.cpp @@ -83,7 +83,8 @@ void ConfigManager::loadDefaults() { {"input_device", -1}, {"buffer_size_ms", 20}, {"chunk_duration_ms", 3000}, - {"padding_ms", 500} + {"padding_ms", 500}, + {"debug_dir", ""} }}, {"ui", QVariantMap{ {"theme", "light"}, diff --git a/src/audio/streaming_audio_writer.cpp b/src/audio/streaming_audio_writer.cpp new file mode 100644 index 0000000..8d389f5 --- /dev/null +++ b/src/audio/streaming_audio_writer.cpp @@ -0,0 +1,253 @@ +#include "streaming_audio_writer.h" +#include "core/vad.h" +#include "utils/logger.h" + +#include +#include +#include +#include +#include +#include + +static const char* const kTag = "StreamingAudioWriter"; + +namespace impress { + +StreamingAudioWriter::StreamingAudioWriter(QObject* parent) + : QObject(parent) +{ +} + +StreamingAudioWriter::~StreamingAudioWriter() { + stop(); +} + +QString StreamingAudioWriter::getAudioStorageDir(bool debugEnabled, const QString& debugDir) { + if (debugEnabled && !debugDir.isEmpty()) { + return debugDir; + } + if (debugEnabled) { + // 使用配置默认值:临时目录 + return QDir::tempPath() + "/impress_audio_debug"; + } + +#ifdef PLATFORM_WINDOWS + return "."; +#else + return QDir::tempPath(); +#endif +} + +bool StreamingAudioWriter::start(int sampleRate, bool debugEnabled, const QString& debugDir) { + QMutexLocker locker(&mutex_); + + if (recording_) { + LOG_WARNING(kTag, "已在录制中"); + return false; + } + + sampleRate_ = sampleRate; + debugEnabled_ = debugEnabled; + debugDir_ = debugDir; + totalSamples_ = 0; + samplesWritten_ = 0; + wasSpeaking_ = false; + silenceFramesAfterSpeech_ = 0; + + // 初始化 VAD(30ms 帧,低能量阈值) + vad_ = std::make_unique(sampleRate_, 30, 0.015f, 3); + + // VAD 帧大小 + vadFrameSize_ = sampleRate_ * 30 / 1000; + if (vadFrameSize_ < 320) vadFrameSize_ = 320; + + // 静音切换:~1s 的连续静音帧 + silenceFramesNeeded_ = 1000 / 30; // ~33 帧 + + // 确保目录存在 + QString dir = getAudioStorageDir(debugEnabled, debugDir); + QDir d; + if (!d.exists(dir)) { + if (!d.mkpath(dir)) { + LOG_ERROR(kTag, QString("无法创建音频存储目录: %1").arg(dir)); + return false; + } + } + + if (!openNewFile()) { + return false; + } + + recording_ = true; + LOG_INFO(kTag, QString("流式录制已启动 (采样率: %1, VAD帧: %2, 静音切换: %3帧, 存储: %4)") + .arg(sampleRate_).arg(vadFrameSize_).arg(silenceFramesNeeded_).arg(dir)); + return true; +} + +void StreamingAudioWriter::writeSamples(const std::vector& samples) { + QMutexLocker locker(&mutex_); + if (!recording_ || !currentStream_) return; + + if (samples.empty()) return; + + // 1. 写入 WAV 文件 (float -> int16) + for (float s : samples) { + s = std::max(-1.0f, std::min(1.0f, s)); // clip + int16_t val = static_cast(s * 32767.0f); + *currentStream_ << val; + samplesWritten_++; + totalSamples_++; + } + + // 2. 用 VAD 检测语音活动 + bool isSpeaking = vad_->process(samples); + + // 3. 静音段切换逻辑: + // 检测到「说话 → 静音」的过渡,连续静音帧数达到阈值时切换 + if (isSpeaking) { + silenceFramesAfterSpeech_ = 0; + wasSpeaking_ = true; + } else if (wasSpeaking_) { + silenceFramesAfterSpeech_++; + if (silenceFramesAfterSpeech_ >= silenceFramesNeeded_ && static_cast(samplesWritten_) > sampleRate_ / 2) { + // 至少有 0.5 秒音频才切换 + LOG_DEBUG(kTag, QString("检测到静音段 (连续 %1 帧, 能量: %2),切换 WAV 文件") + .arg(silenceFramesAfterSpeech_) + .arg(vad_->currentEnergy(), 0, 'f', 4)); + + // 完成当前文件 + finalizeWavFile(); + closeCurrentFile(); + + // 发射完成信号 + int durationMs = static_cast(samplesWritten_ * 1000 / sampleRate_); + QString completedPath = currentFilePath_; + emit chunkCompleted(completedPath, durationMs); + + // 打开新文件 + samplesWritten_ = 0; + silenceFramesAfterSpeech_ = 0; + wasSpeaking_ = false; + vad_ = std::make_unique(sampleRate_, 30, 0.015f, 3); + + if (!openNewFile()) { + LOG_ERROR(kTag, "无法打开新的 WAV 文件,停止录制"); + recording_ = false; + return; + } + } + } + // else: 还没开始说话,不计数 +} + +void StreamingAudioWriter::stop() { + QMutexLocker locker(&mutex_); + if (!recording_) return; + + if (samplesWritten_ > 0) { + finalizeWavFile(); + // 停止时不触发 chunkCompleted,因为最后一小段可能太短 + // 如果需要处理最后一段,可以在外部调用时手动处理 + } + closeCurrentFile(); + + recording_ = false; + LOG_INFO(kTag, QString("流式录制已停止 (总计: %1 样本, 约 %2 秒)") + .arg(totalSamples_).arg(totalSamples_ * 1000.0 / sampleRate_ / 1000.0, 0, 'f', 1)); +} + +QString StreamingAudioWriter::currentFilePath() const { + QMutexLocker locker(&mutex_); + return currentFilePath_; +} + +int StreamingAudioWriter::recordedDurationMs() const { + QMutexLocker locker(&mutex_); + return static_cast(totalSamples_ * 1000 / sampleRate_); +} + +bool StreamingAudioWriter::openNewFile() { + // 生成文件名 + QString dir = getAudioStorageDir(debugEnabled_, debugDir_); + QString timestamp = QDateTime::currentDateTime().toString("yyyyMMdd_HHmmss_zzz"); + currentFilePath_ = QString("%1/record_%2.wav").arg(dir).arg(timestamp); + + currentFile_ = new QFile(currentFilePath_); + if (!currentFile_->open(QIODevice::WriteOnly)) { + LOG_ERROR(kTag, QString("无法创建 WAV 文件: %1").arg(currentFilePath_)); + delete currentFile_; + currentFile_ = nullptr; + return false; + } + + currentStream_ = new QDataStream(currentFile_); + currentStream_->setByteOrder(QDataStream::LittleEndian); + + // 初始化并写入 WAV 头 + WavHeader header{}; + memcpy(header.riff, "RIFF", 4); + memcpy(header.wave, "WAVE", 4); + memcpy(header.fmt, "fmt ", 4); + header.fmtSize = 16; + header.audioFormat = 1; // PCM + header.numChannels = 1; // mono + header.sampleRate = static_cast(sampleRate_); + header.byteRate = static_cast(sampleRate_) * 2; + header.blockAlign = 2; + header.bitsPerSample = 16; + memcpy(header.data, "data", 4); + header.dataSize = 0; + header.fileSize = sizeof(WavHeader) - 8; + + // 写入头 + currentStream_->writeRawData(header.riff, 4); + *currentStream_ << header.fileSize; + currentStream_->writeRawData(header.wave, 4); + currentStream_->writeRawData(header.fmt, 4); + *currentStream_ << header.fmtSize; + *currentStream_ << header.audioFormat; + *currentStream_ << header.numChannels; + *currentStream_ << header.sampleRate; + *currentStream_ << header.byteRate; + *currentStream_ << header.blockAlign; + *currentStream_ << header.bitsPerSample; + currentStream_->writeRawData(header.data, 4); + *currentStream_ << header.dataSize; + + LOG_DEBUG(kTag, QString("新 WAV 文件已打开: %1").arg(currentFilePath_)); + return true; +} + +void StreamingAudioWriter::closeCurrentFile() { + if (currentStream_) { + delete currentStream_; + currentStream_ = nullptr; + } + if (currentFile_) { + currentFile_->close(); + delete currentFile_; + currentFile_ = nullptr; + } + currentFilePath_.clear(); +} + +void StreamingAudioWriter::finalizeWavFile() { + if (!currentFile_ || !currentFile_->isOpen()) return; + + // 计算实际大小 + uint32_t dataBytes = samplesWritten_ * 2; // 16-bit mono + uint32_t fileSize = sizeof(WavHeader) + dataBytes - 8; + + // 回写到文件头更新大小 + currentFile_->seek(4); + currentStream_->writeRawData(reinterpret_cast(&fileSize), 4); + currentFile_->seek(sizeof(WavHeader) - 4); // dataSize 偏移 + currentStream_->writeRawData(reinterpret_cast(&dataBytes), 4); + currentFile_->flush(); + + int durationMs = static_cast(samplesWritten_ * 1000 / sampleRate_); + LOG_DEBUG(kTag, QString("WAV 文件已保存: %1 (时长: %2ms, 样本: %3)") + .arg(currentFilePath_).arg(durationMs).arg(samplesWritten_)); +} + +} // namespace impress diff --git a/src/audio/streaming_audio_writer.h b/src/audio/streaming_audio_writer.h new file mode 100644 index 0000000..4580f07 --- /dev/null +++ b/src/audio/streaming_audio_writer.h @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace impress { + +class VoiceActivityDetector; + +/** + * @brief 流式音频录制器 + * + * 将连续音频数据写入 WAV 文件,通过 VAD 检测静音段自动切换文件。 + * 完成一个 WAV 文件后,通过 signal 输出文件路径供外部识别。 + * + * 工作流程: + * 1. 音频数据持续写入当前 WAV 文件 + * 2. VAD 实时检测语音活动 + * 3. 检测到 ~1s 静音后,关闭当前文件、发射 chunkCompleted 信号、打开新文件 + * 4. 外部收到信号后,在后台线程对 WAV 文件进行识别 + * + * 音频存储路径: + * - debug_save_audio 开启 → 使用配置的 audio_debug_dir + * - debug_save_audio 关闭 → Windows: 当前目录, Linux/Mac: 系统临时目录 + */ +class StreamingAudioWriter : public QObject { + Q_OBJECT +public: + explicit StreamingAudioWriter(QObject* parent = nullptr); + ~StreamingAudioWriter() override; + + /** + * @brief 开始录制(打开第一个 WAV 文件) + * @param sampleRate 采样率 (如 16000) + * @param debugEnabled 是否开启调试模式(保存到配置路径) + * @param debugDir 调试目录(debugEnabled=true 时使用,为空则使用默认值) + */ + bool start(int sampleRate, bool debugEnabled = false, const QString& debugDir = QString()); + + /** + * @brief 写入音频样本(归一化 PCM float,范围 -1.0 ~ 1.0) + * + * 此方法会: + * 1. 写入当前 WAV 文件 + * 2. 通过 VAD 检测语音活动 + * 3. 检测到静音段时自动切换文件并触发 chunkCompleted 信号 + */ + void writeSamples(const std::vector& samples); + + /** + * @brief 停止录制,关闭当前文件(不触发 chunkCompleted) + */ + void stop(); + + /** @brief 是否正在录制 */ + bool isRecording() const { return recording_; } + + /** @brief 当前 WAV 文件路径 */ + QString currentFilePath() const; + + /** @brief 当前文件已写入的样本数 */ + int currentSampleCount() const { return samplesWritten_; } + + /** @brief 已录制音频总时长(毫秒) */ + int recordedDurationMs() const; + + /** + * @brief 获取音频存储目录(根据 debug 状态自动选择) + */ + static QString getAudioStorageDir(bool debugEnabled, const QString& debugDir = QString()); + +signals: + /** + * @brief 一个 WAV 文件录制完成(检测到静音段切换) + * @param filePath WAV 文件的完整路径 + * @param durationMs 音频时长(毫秒) + */ + void chunkCompleted(const QString& filePath, int durationMs); + + /** @brief 录制错误 */ + void error(const QString& message); + +private: + /** 打开新的 WAV 文件 */ + bool openNewFile(); + + /** 关闭当前 WAV 文件(不更新文件头) */ + void closeCurrentFile(); + + /** 更新 WAV 文件头的 data chunk 大小和 RIFF 大小 */ + void finalizeWavFile(); + + // WAV 文件头结构 (44 字节) + struct WavHeader { + char riff[4]; // "RIFF" + uint32_t fileSize; // 文件总大小 - 8 + char wave[4]; // "WAVE" + char fmt[4]; // "fmt " + uint32_t fmtSize; // fmt chunk 大小 (16) + uint16_t audioFormat; // 音频格式 (1 = PCM) + uint16_t numChannels; // 通道数 (1 = mono) + uint32_t sampleRate; // 采样率 + uint32_t byteRate; // 字节率 + uint16_t blockAlign; // 块对齐 + uint16_t bitsPerSample;// 位深度 (16) + char data[4]; // "data" + uint32_t dataSize; // data chunk 大小 + }; + + int sampleRate_ = 16000; + bool recording_ = false; + bool debugEnabled_ = false; + QString debugDir_; + + // VAD + std::unique_ptr vad_; + bool wasSpeaking_ = false; // 上一帧是否在说话 + int silenceFramesAfterSpeech_ = 0; // 说话后连续静音帧数 + int silenceFramesNeeded_ = 4; // 需要多少帧静音才切换(~1s) + + // 当前文件 + QString currentFilePath_; + QFile* currentFile_ = nullptr; + QDataStream* currentStream_ = nullptr; + uint32_t samplesWritten_ = 0; + int64_t totalSamples_ = 0; + + // 能量计算帧大小(VAD 帧大小) + int vadFrameSize_ = 480; // 16000 * 30ms = 480 + + mutable QMutex mutex_; +}; + +} // namespace impress diff --git a/src/ui/settings_page.cpp b/src/ui/settings_page.cpp index 934738b..7869cc1 100644 --- a/src/ui/settings_page.cpp +++ b/src/ui/settings_page.cpp @@ -120,6 +120,16 @@ void SettingsPage::setupUI() { populateAudioDevices(); audioLayout->addRow("输入设备:", audioDeviceCombo_); + // 音频调试目录 + auto* debugDirRow = new QHBoxLayout(); + audioDebugDirEdit_ = new QLineEdit(this); + audioDebugDirEdit_->setPlaceholderText("流式识别 WAV 文件保存路径(为空时使用系统临时目录)"); + audioDebugDirBtn_ = new QPushButton("浏览...", this); + connect(audioDebugDirBtn_, &QPushButton::clicked, this, &SettingsPage::onBrowseAudioDebugDir); + debugDirRow->addWidget(audioDebugDirEdit_); + debugDirRow->addWidget(audioDebugDirBtn_); + audioLayout->addRow("调试音频目录:", debugDirRow); + bufferSizeSpin_ = new QSpinBox(this); bufferSizeSpin_->setRange(10, 100); bufferSizeSpin_->setValue(20); @@ -206,6 +216,7 @@ void SettingsPage::loadFromConfig() { // 恢复音频设备选择 int savedDevice = configManager_->get("audio.input_device").toInt(); selectAudioDevice(savedDevice); + audioDebugDirEdit_->setText(configManager_->get("audio.debug_dir").toString()); themeCombo_->setCurrentText(configManager_->get("ui.theme").toString()); fontSizeSpin_->setValue(configManager_->get("ui.font_size").toInt()); @@ -229,6 +240,7 @@ void SettingsPage::saveToConfig() { batch["stt.temperature"] = temperatureSpin_->value(); batch["shortcuts.voice_hotkey"] = hotkeyRecorder_->hotkeyText(); batch["audio.input_device"] = getSelectedAudioDeviceIndex(); + batch["audio.debug_dir"] = audioDebugDirEdit_->text(); batch["audio.buffer_size_ms"] = bufferSizeSpin_->value(); batch["audio.chunk_duration_ms"] = chunkDurationSpin_->value(); batch["audio.padding_ms"] = paddingSpin_->value(); @@ -293,6 +305,14 @@ void SettingsPage::onBrowseTokensPath() { } } +void SettingsPage::onBrowseAudioDebugDir() { + QString path = QFileDialog::getExistingDirectory(this, "选择调试音频目录", "", + QFileDialog::ShowDirsOnly); + if (!path.isEmpty()) { + audioDebugDirEdit_->setText(path); + } +} + void SettingsPage::onSaveConfig() { saveToConfig(); if (configManager_->save()) { diff --git a/src/ui/settings_page.h b/src/ui/settings_page.h index f0f9043..d0b73cc 100644 --- a/src/ui/settings_page.h +++ b/src/ui/settings_page.h @@ -31,6 +31,7 @@ public: private slots: void onBrowseModelPath(); void onBrowseTokensPath(); + void onBrowseAudioDebugDir(); void onSaveConfig(); void onResetConfig(); @@ -62,6 +63,8 @@ private: // 音频设置 QComboBox* audioDeviceCombo_; + QLineEdit* audioDebugDirEdit_; + QPushButton* audioDebugDirBtn_; QSpinBox* bufferSizeSpin_; QSpinBox* chunkDurationSpin_; QSpinBox* paddingSpin_; diff --git a/src/ui/stt_test_page.cpp b/src/ui/stt_test_page.cpp index 4265250..0990604 100644 --- a/src/ui/stt_test_page.cpp +++ b/src/ui/stt_test_page.cpp @@ -1,7 +1,7 @@ #include "stt_test_page.h" #include "core/sense_voice_engine.h" #include "audio/audio_capture.h" -#include "audio/audio_ring_buffer.h" +#include "audio/streaming_audio_writer.h" #include "widgets/audio_waveform.h" #include "app/config_manager.h" #include "utils/logger.h" @@ -18,7 +18,6 @@ #include #include #include -#include #include static const char* const kTag = "STTTestPage"; @@ -32,23 +31,21 @@ STTTestPage::STTTestPage(ConfigManager* configManager, , configManager_(configManager) , sttEngine_(sttEngine) , audioCapture_(new AudioCapture(this)) - , inferenceTimer_(new QTimer(this)) + , streamingWriter_(new StreamingAudioWriter(this)) { setupUI(); // 信号连接 connect(audioCapture_, &AudioCapture::audioDataReady, this, &STTTestPage::onAudioDataReady); + connect(streamingWriter_, &StreamingAudioWriter::chunkCompleted, + this, &STTTestPage::onChunkCompleted); connect(sttEngine_, &SenseVoiceEngine::modelLoaded, this, &STTTestPage::onModelLoaded); connect(sttEngine_, &SenseVoiceEngine::modelLoadError, this, &STTTestPage::onModelLoadError); connect(sttEngine_, &SenseVoiceEngine::modelUnloaded, this, &STTTestPage::onModelUnloaded); - - // 推理定时器:周期性触发后台推理 - connect(inferenceTimer_, &QTimer::timeout, - this, &STTTestPage::onInferenceTimer); } STTTestPage::~STTTestPage() = default; @@ -64,13 +61,6 @@ void STTTestPage::setupUI() { deviceCombo_->addItems(AudioCapture::getDeviceList()); controlLayout->addRow("输入设备:", deviceCombo_); - chunkSizeSpin_ = new QSpinBox(this); - chunkSizeSpin_->setRange(500, 10000); - chunkSizeSpin_->setSingleStep(500); - chunkSizeSpin_->setValue(3000); - chunkSizeSpin_->setSuffix(" ms"); - controlLayout->addRow("推理间隔:", chunkSizeSpin_); - auto* btnLayout = new QHBoxLayout(); recordBtn_ = new QPushButton("开始录音", this); recordBtn_->setMinimumWidth(120); @@ -117,16 +107,14 @@ void STTTestPage::updateUIState() { ? "QPushButton { font-weight: bold; padding: 8px 16px; background-color: #e74c3c; color: white; }" : "QPushButton { font-weight: bold; padding: 8px 16px; }"); deviceCombo_->setEnabled(!isRecording_ && !isLoadingModel_); - chunkSizeSpin_->setEnabled(!isRecording_ && !isLoadingModel_); } void STTTestPage::onToggleRecording() { if (isRecording_) { + streamingWriter_->stop(); audioCapture_->stop(); - inferenceTimer_->stop(); isRecording_ = false; isInferencing_ = false; - audioBuffer_.clear(); } else { // 检查全局模型是否已加载 if (!sttEngine_->isLoaded()) { @@ -172,81 +160,104 @@ void STTTestPage::onModelUnloaded() { void STTTestPage::startAudioCapture() { int deviceIdx = deviceCombo_->currentIndex() - 1; audioSampleRate_ = configManager_->get("stt.sample_rate").toInt(); + bool debugEnabled = configManager_->get("stt.debug_save_audio").toBool(); + // 启动流式录制器 + if (!streamingWriter_->start(audioSampleRate_, debugEnabled)) { + QMessageBox::critical(this, "错误", "无法启动流式录制器"); + return; + } + + // 启动音频采集 if (!audioCapture_->start(deviceIdx, audioSampleRate_)) { + streamingWriter_->stop(); QMessageBox::critical(this, "错误", "无法启动音频采集"); return; } isRecording_ = true; - audioBuffer_.clear(); isInferencing_ = false; + completedCount_ = 0; - // 启动周期性推理定时器 - startInferenceTimer(); - - statusLabel_->setText("录音中 | 模型已加载"); + statusLabel_->setText("录音中 | VAD 流式识别"); updateUIState(); } -void STTTestPage::startInferenceTimer() { - int interval = chunkSizeSpin_->value(); // 与推理间隔同步 - inferenceTimer_->start(interval); -} - void STTTestPage::onAudioDataReady(const std::vector& samples, int /* sampleRate */) { - // 仅缓存音频数据,不直接调用推理 - // 避免推理阻塞音频采集线程 - audioBuffer_.insert(audioBuffer_.end(), samples.begin(), samples.end()); + // 写入流式录制器(WAV 文件 + VAD 静音检测) + streamingWriter_->writeSamples(samples); - // 更新波形显示(使用最新数据片段) + // 更新波形显示 waveform_->setSamples(samples); } -void STTTestPage::onInferenceTimer() { - if (!sttEngine_->isLoaded() || isInferencing_) { +void STTTestPage::onChunkCompleted(const QString& filePath, int durationMs) { + completedCount_++; + LOG_INFO(kTag, QString("WAV 片段 #%1 已完成: %2 (%3ms)") + .arg(completedCount_).arg(filePath).arg(durationMs)); + + statusLabel_->setText(QString("正在识别 #%1 (%2ms)...").arg(completedCount_).arg(durationMs)); + + // 在后台线程对 WAV 文件进行识别 + transcribeChunk(filePath, durationMs); +} + +void STTTestPage::transcribeChunk(const QString& filePath, int /* durationMs */) { + if (isInferencing_) { + // 上一个识别还没完成,跳过(避免堆积) + LOG_WARNING(kTag, "上一个识别仍在进行中,跳过当前片段"); return; } - int chunkSize = audioSampleRate_ * chunkSizeSpin_->value() / 1000; - - if (static_cast(audioBuffer_.size()) < chunkSize) { - return; // 缓冲区数据不足,等待下一次 - } - - // 提取一个推理块的音频 - std::vector chunk(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize); - audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize); - - // 在后台线程执行推理 isInferencing_ = true; - statusLabel_->setText("推理中..."); - int sampleRate = audioSampleRate_; - QString language = configManager_->get("stt.language").toString(); + // 在后台线程读取 WAV 文件并推理 + (void)QtConcurrent::run([this, filePath]() { + QString text; + QString errorMsg; - (void)QtConcurrent::run([this, chunk, sampleRate, language]() { - auto result = sttEngine_->infer(chunk, sampleRate, language); + // 读取 WAV 文件为 float 样本 + QFile file(filePath); + if (!file.open(QIODevice::ReadOnly)) { + errorMsg = QString("无法打开 WAV 文件: %1").arg(filePath); + } else { + // 跳过 44 字节 WAV 头 + file.seek(44); + QByteArray raw = file.readAll(); + file.close(); - // 回到主线程更新 UI - QMetaObject::invokeMethod(this, [this, result]() { - isInferencing_ = false; - - if (result.text.isEmpty() && !result.text.isNull()) { - // 静音段 - latencyLabel_->setText(QString("延迟: %1 ms").arg(result.latency_ms, 0, 'f', 1)); - } else { - emit onRecognitionResult(result.text, result.confidence, - result.latency_ms, result.isFinal); + // int16 -> float + int numSamples = raw.size() / 2; + std::vector samples(numSamples); + for (int i = 0; i < numSamples; i++) { + int16_t val = *reinterpret_cast(raw.data() + i * 2); + samples[i] = static_cast(val) / 32767.0f; + } + + if (!sttEngine_->isLoaded()) { + text = "[错误] 模型未加载"; + } else { + QString language = configManager_->get("stt.language").toString(); + auto result = sttEngine_->infer(samples, audioSampleRate_, language); + text = result.text; + errorMsg = result.text.startsWith("[错误]") ? result.text : QString(); + } + } + + // 回到主线程更新 UI + QMetaObject::invokeMethod(this, [this, text, errorMsg, filePath]() { + isInferencing_ = false; + + if (!errorMsg.isEmpty() && text.startsWith("[错误]")) { + statusLabel_->setText(text); + } else if (text.isEmpty()) { + statusLabel_->setText(QString("片段 #%1: 静音").arg(completedCount_)); + } else { + emit onRecognitionResult(text, 1.0f, 0, true); } - // 更新状态 if (isRecording_) { - int bufMs = (audioSampleRate_ > 0) - ? static_cast(audioBuffer_.size() * 1000 / audioSampleRate_) - : 0; - statusLabel_->setText( - QString("录音中 | 缓冲区: %1 ms").arg(bufMs)); + statusLabel_->setText(QString("录音中 | 已识别 %1 个片段").arg(completedCount_)); } }, Qt::QueuedConnection); }); @@ -256,8 +267,8 @@ void STTTestPage::onRecognitionResult(const QString& text, float confidence, double latency, bool isFinal) { QString timestamp = QDateTime::currentDateTime().toString("hh:mm:ss"); - QString line = QString("[%1] %2 (置信度: %3%, 延迟: %4 ms)\n") - .arg(timestamp, text) + QString line = QString("[%1] #%2 %3 (置信度: %4%, 延迟: %5 ms)") + .arg(timestamp).arg(completedCount_).arg(text) .arg(confidence * 100, 0, 'f', 1) .arg(latency, 0, 'f', 1); diff --git a/src/ui/stt_test_page.h b/src/ui/stt_test_page.h index 69da3b5..f3ffef6 100644 --- a/src/ui/stt_test_page.h +++ b/src/ui/stt_test_page.h @@ -9,20 +9,20 @@ class QPushButton; class QComboBox; class QTextEdit; class QSpinBox; -class QTimer; namespace impress { class ConfigManager; class SenseVoiceEngine; class AudioCapture; +class StreamingAudioWriter; /** * @brief STT 测试页面 * - * 实时麦克风采集 + 周期性后台推理。 + * 实时麦克风采集 + 基于 VAD 的流式 WAV 文件录制 + 后台识别。 * 音频采集与推理分离,防止推理阻塞音频流。 - * 使用 SenseVoice 模型进行推理。 + * 使用 VAD 检测静音段自动切换 WAV 文件,每个文件完成后触发识别。 */ class STTTestPage : public QWidget { Q_OBJECT @@ -35,22 +35,22 @@ public: private slots: void onToggleRecording(); void onAudioDataReady(const std::vector& samples, int sampleRate); + void onChunkCompleted(const QString& filePath, int durationMs); void onRecognitionResult(const QString& text, float confidence, double latency, bool isFinal); void onModelLoaded(const QString& modelPath); void onModelLoadError(const QString& modelPath, const QString& error); void onModelUnloaded(); - void onInferenceTimer(); private: void setupUI(); void updateUIState(); void startAudioCapture(); - void startInferenceTimer(); + void transcribeChunk(const QString& filePath, int durationMs); ConfigManager* configManager_; SenseVoiceEngine* sttEngine_; AudioCapture* audioCapture_; - QTimer* inferenceTimer_; + StreamingAudioWriter* streamingWriter_; // UI 控件 QComboBox* deviceCombo_; @@ -59,13 +59,12 @@ private: QLabel* latencyLabel_; QLabel* statusLabel_; AudioWaveform* waveform_; - QSpinBox* chunkSizeSpin_; bool isRecording_ = false; bool isLoadingModel_ = false; bool isInferencing_ = false; int audioSampleRate_ = 16000; - std::vector audioBuffer_; + int completedCount_ = 0; // 已完成文件计数 }; } // namespace impress