From a7a5b141a9c8239fb06613d595a37d0409a23ab0 Mon Sep 17 00:00:00 2001 From: impressionyang Date: Tue, 12 May 2026 19:41:38 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20STT=E6=B5=8B=E8=AF=95=E9=A1=B5=E9=9D=A2?= =?UTF-8?q?=E5=BC=82=E6=AD=A5=E6=8E=A8=E7=90=86=EF=BC=8C=E9=98=B2=E6=AD=A2?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E6=8E=A8=E7=90=86=E9=98=BB=E5=A1=9E=E9=9F=B3?= =?UTF-8?q?=E9=A2=91=E9=87=87=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SenseVoice 完整推理耗时数秒,原实现中 processAudioChunk 在 音频回调中同步调用 infer(),导致 PortAudio 回调线程阻塞, 表现为"程序无响应"。 修复方案: - onAudioDataReady 仅缓存音频数据,不再同步调用推理 - QTimer 周期性触发,从缓冲区提取音频块 - 推理在 QtConcurrent 后台线程执行 - isInferencing_ 标志保证同一时刻只有一个推理任务在执行,推理期间定时器触发被跳过,新音频继续累积在缓冲区中(不丢弃) - UI 实时显示缓冲区状态和推理进度 Co-Authored-By: Claude Opus 4.6 --- src/ui/stt_test_page.cpp | 99 ++++++++++++++++++++++++++++++---------- src/ui/stt_test_page.h | 13 ++++-- 2 files changed, 85 insertions(+), 27 deletions(-) diff --git a/src/ui/stt_test_page.cpp b/src/ui/stt_test_page.cpp index ef47524..bd12481 100644 --- a/src/ui/stt_test_page.cpp +++ b/src/ui/stt_test_page.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include static const char* const kTag = "STTTestPage"; @@ -28,6 +30,7 @@ STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent) , configManager_(configManager) , sttEngine_(new SenseVoiceEngine(this)) , audioCapture_(new AudioCapture(this)) + , inferenceTimer_(new QTimer(this)) { setupUI(); @@ -40,6 +43,10 @@ STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent) this, &STTTestPage::onModelLoadError); connect(sttEngine_, &SenseVoiceEngine::modelUnloaded, this, &STTTestPage::onModelUnloaded); + + // 推理定时器:周期性触发后台推理 + connect(inferenceTimer_, &QTimer::timeout, + this, &STTTestPage::onInferenceTimer); } STTTestPage::~STTTestPage() = default; @@ -114,8 +121,11 @@ void STTTestPage::updateUIState() { void STTTestPage::onToggleRecording() { if (isRecording_) { audioCapture_->stop(); + inferenceTimer_->stop();
sttEngine_->unloadModel(); isRecording_ = false; + isInferencing_ = false; + audioBuffer_.clear(); } else { // 读取配置 QString modelPath = configManager_->get("stt.model_path").toString(); @@ -172,49 +182,92 @@ void STTTestPage::onModelLoadError(const QString& modelPath, const QString& erro void STTTestPage::onModelUnloaded() { isLoadingModel_ = false; + isInferencing_ = false; statusLabel_->setText("模型已卸载"); } void STTTestPage::startAudioCapture() { int deviceIdx = deviceCombo_->currentIndex() - 1; - int sampleRate = configManager_->get("stt.sample_rate").toInt(); + audioSampleRate_ = configManager_->get("stt.sample_rate").toInt(); - if (!audioCapture_->start(deviceIdx, sampleRate)) { + if (!audioCapture_->start(deviceIdx, audioSampleRate_)) { QMessageBox::critical(this, "错误", "无法启动音频采集"); return; } + isRecording_ = true; + audioBuffer_.clear(); + isInferencing_ = false; + + // 启动周期性推理定时器 + startInferenceTimer(); + statusLabel_->setText(QString("录音中 | 模型: %1").arg( QFileInfo(currentModelPath_).fileName())); updateUIState(); } -void STTTestPage::onAudioDataReady(const std::vector& samples, int sampleRate) { - chunkBuffer_.insert(chunkBuffer_.end(), samples.begin(), samples.end()); - - int chunkSize = configManager_->get("stt.sample_rate").toInt() - * chunkSizeSpin_->value() / 1000; - - if (static_cast(chunkBuffer_.size()) >= chunkSize) { - std::vector chunk(chunkBuffer_.begin(), chunkBuffer_.begin() + chunkSize); - chunkBuffer_.erase(chunkBuffer_.begin(), chunkBuffer_.begin() + chunkSize); - - waveform_->setSamples(samples); - processAudioChunk(chunk, sampleRate); - } else { - waveform_->setSamples(samples); - } +void STTTestPage::startInferenceTimer() { + int interval = chunkSizeSpin_->value(); // 与推理间隔同步 + inferenceTimer_->start(interval); } -void STTTestPage::processAudioChunk(const std::vector& samples, int sampleRate) { - // 模型已在 onToggleRecording 中异步加载,此处防御性检查 - if (!sttEngine_->isLoaded()) { +void STTTestPage::onAudioDataReady(const std::vector& samples, int /* 
sampleRate */) { + // 仅缓存音频数据,不直接调用推理 + // 避免推理阻塞音频采集线程 + audioBuffer_.insert(audioBuffer_.end(), samples.begin(), samples.end()); + + // 更新波形显示(使用最新数据片段) + waveform_->setSamples(samples); +} + +void STTTestPage::onInferenceTimer() { + if (!sttEngine_->isLoaded() || isInferencing_) { return; } - auto result = sttEngine_->infer(samples, sampleRate, - configManager_->get("stt.language").toString()); - emit onRecognitionResult(result.text, result.confidence, result.latency_ms, result.isFinal); + int chunkSize = audioSampleRate_ * chunkSizeSpin_->value() / 1000; + + if (static_cast(audioBuffer_.size()) < chunkSize) { + return; // 缓冲区数据不足,等待下一次 + } + + // 提取一个推理块的音频 + std::vector chunk(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize); + audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize); + + // 在后台线程执行推理 + isInferencing_ = true; + statusLabel_->setText("推理中..."); + + int sampleRate = audioSampleRate_; + QString language = configManager_->get("stt.language").toString(); + + (void)QtConcurrent::run([this, chunk, sampleRate, language]() { + auto result = sttEngine_->infer(chunk, sampleRate, language); + + // 回到主线程更新 UI + QMetaObject::invokeMethod(this, [this, result]() { + isInferencing_ = false; + + if (result.text.isEmpty() && !result.text.isNull()) { + // 静音段 + latencyLabel_->setText(QString("延迟: %1 ms").arg(result.latency_ms, 0, 'f', 1)); + } else { + emit onRecognitionResult(result.text, result.confidence, + result.latency_ms, result.isFinal); + } + + // 更新状态 + if (isRecording_) { + int bufMs = (audioSampleRate_ > 0) + ? 
static_cast(audioBuffer_.size() * 1000 / audioSampleRate_) + : 0; + statusLabel_->setText( + QString("录音中 | 缓冲区: %1 ms").arg(bufMs)); + } + }, Qt::QueuedConnection); + }); } void STTTestPage::onRecognitionResult(const QString& text, float confidence, diff --git a/src/ui/stt_test_page.h b/src/ui/stt_test_page.h index bd319fd..9b9ddad 100644 --- a/src/ui/stt_test_page.h +++ b/src/ui/stt_test_page.h @@ -9,6 +9,7 @@ class QPushButton; class QComboBox; class QTextEdit; class QSpinBox; +class QTimer; namespace impress { @@ -19,9 +20,9 @@ class AudioCapture; /** * @brief STT 测试页面 * - * 实时麦克风采集 + 流式识别。 + * 实时麦克风采集 + 周期性后台推理。 + * 音频采集与推理分离,防止推理阻塞音频流。 * 使用 SenseVoice 模型进行推理。 - * 模型异步加载,不阻塞 UI。 */ class STTTestPage : public QWidget { Q_OBJECT @@ -36,16 +37,18 @@ private slots: void onModelLoaded(const QString& modelPath); void onModelLoadError(const QString& modelPath, const QString& error); void onModelUnloaded(); + void onInferenceTimer(); private: void setupUI(); void updateUIState(); void startAudioCapture(); - void processAudioChunk(const std::vector& samples, int sampleRate); + void startInferenceTimer(); ConfigManager* configManager_; SenseVoiceEngine* sttEngine_; AudioCapture* audioCapture_; + QTimer* inferenceTimer_; // UI 控件 QComboBox* deviceCombo_; @@ -58,7 +61,9 @@ private: bool isRecording_ = false; bool isLoadingModel_ = false; - std::vector chunkBuffer_; + bool isInferencing_ = false; + int audioSampleRate_ = 16000; + std::vector audioBuffer_; QString currentModelPath_; };