feat: 实现基于 VAD 的流式 WAV 录制识别方案

- 新增 StreamingAudioWriter 组件:持续录制 WAV 文件,VAD 检测静音段自动切换
- 静音检测:检测到 ~1s 连续静音后关闭当前文件,触发 chunkCompleted 信号
- STTTestPage 重构:移除缓冲区推理模式,改为 WAV 文件流式识别
- 每个 WAV 文件完成后在后台线程读取并推理,不阻塞继续录制
- 设置页面新增「调试音频目录」配置项
- 音频存储路径:debug 模式使用配置目录(未配置时使用系统临时目录下的 impress_audio_debug);非 debug 模式使用系统临时目录(Windows 下为当前工作目录)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alvin Young 2026-05-13 15:33:03 +08:00
parent dc4ebab47c
commit 01a39ddc8c
8 changed files with 505 additions and 76 deletions

View File

@ -67,6 +67,7 @@ set(SOURCES
src/audio/audio_capture.cpp
src/audio/audio_decoder.cpp
src/audio/audio_ring_buffer.cpp
src/audio/streaming_audio_writer.cpp
# UI
src/ui/main_window.cpp
@ -102,6 +103,7 @@ set(HEADERS
src/audio/audio_capture.h
src/audio/audio_decoder.h
src/audio/audio_ring_buffer.h
src/audio/streaming_audio_writer.h
src/ui/main_window.h
src/ui/stt_test_page.h

View File

@ -83,7 +83,8 @@ void ConfigManager::loadDefaults() {
{"input_device", -1},
{"buffer_size_ms", 20},
{"chunk_duration_ms", 3000},
{"padding_ms", 500}
{"padding_ms", 500},
{"debug_dir", ""}
}},
{"ui", QVariantMap{
{"theme", "light"},

View File

@ -0,0 +1,253 @@
#include "streaming_audio_writer.h"
#include "core/vad.h"
#include "utils/logger.h"
#include <QDateTime>
#include <QStandardPaths>
#include <QDir>
#include <cmath>
#include <algorithm>
#include <cstring>
static const char* const kTag = "StreamingAudioWriter";
namespace impress {
/** @brief Creates an idle writer; no file is opened until start() is called. */
StreamingAudioWriter::StreamingAudioWriter(QObject* parent) : QObject(parent) {}

/** @brief Ends any recording in progress via stop() before the object goes away. */
StreamingAudioWriter::~StreamingAudioWriter()
{
    stop();
}
/**
 * @brief Resolves the directory in which recorded WAV files are stored.
 *
 * - Debug disabled: "." when PLATFORM_WINDOWS is defined, otherwise the
 *   system temp path.
 * - Debug enabled: @p debugDir when it is non-empty, otherwise the
 *   "impress_audio_debug" folder below the system temp path.
 *
 * @param debugEnabled Whether debug storage is requested.
 * @param debugDir     Preferred directory for debug storage; may be empty.
 * @return The directory path; this function does not create it.
 */
QString StreamingAudioWriter::getAudioStorageDir(bool debugEnabled, const QString& debugDir) {
    if (!debugEnabled) {
#ifdef PLATFORM_WINDOWS
        return ".";
#else
        return QDir::tempPath();
#endif
    }

    if (debugDir.isEmpty()) {
        // No directory configured: fall back to a fixed folder under temp.
        return QDir::tempPath() + "/impress_audio_debug";
    }
    return debugDir;
}
/**
 * @brief Begins a streaming recording session and opens the first WAV file.
 *
 * Resets the per-session counters, creates a fresh voice activity detector,
 * makes sure the storage directory exists and opens the first output file.
 *
 * @param sampleRate   Sample rate in Hz; must be positive because it is used
 *                     as a divisor when durations are computed.
 * @param debugEnabled Forwarded to getAudioStorageDir() to pick the directory.
 * @param debugDir     Forwarded to getAudioStorageDir(); may be empty.
 * @return true when recording has started; false when a session is already
 *         active, the sample rate is not positive, the directory cannot be
 *         created or the first file cannot be opened.
 */
bool StreamingAudioWriter::start(int sampleRate, bool debugEnabled, const QString& debugDir) {
    QMutexLocker locker(&mutex_);
    if (recording_) {
        LOG_WARNING(kTag, "已在录制中");
        return false;
    }

    // A zero/negative rate would later cause an integer division by zero in
    // the duration calculations, so refuse it up front.
    if (sampleRate <= 0) {
        LOG_ERROR(kTag, QString("无效的采样率: %1").arg(sampleRate));
        return false;
    }

    sampleRate_ = sampleRate;
    debugEnabled_ = debugEnabled;
    debugDir_ = debugDir;
    totalSamples_ = 0;
    samplesWritten_ = 0;
    wasSpeaking_ = false;
    silenceFramesAfterSpeech_ = 0;

    // VAD with 30 ms frames and a low energy threshold (per the original
    // note); the exact meaning of each argument is defined in core/vad.h.
    vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.015f, 3);

    // VAD frame size in samples, never below 320.
    vadFrameSize_ = sampleRate_ * 30 / 1000;
    if (vadFrameSize_ < 320) vadFrameSize_ = 320;

    // Number of consecutive silent results that triggers a file switch.
    // NOTE(review): writeSamples() advances the counter once per call, not
    // once per 30 ms frame, so the real silence length depends on the capture
    // buffer size — confirm this matches the intended ~1 s.
    silenceFramesNeeded_ = 1000 / 30;  // ~33

    // Make sure the storage directory exists.
    const QString dir = getAudioStorageDir(debugEnabled, debugDir);
    QDir d;
    if (!d.exists(dir) && !d.mkpath(dir)) {
        LOG_ERROR(kTag, QString("无法创建音频存储目录: %1").arg(dir));
        return false;
    }

    if (!openNewFile()) {
        return false;
    }

    recording_ = true;
    LOG_INFO(kTag, QString("流式录制已启动 (采样率: %1, VAD帧: %2, 静音切换: %3帧, 存储: %4)")
        .arg(sampleRate_).arg(vadFrameSize_).arg(silenceFramesNeeded_).arg(dir));
    return true;
}
void StreamingAudioWriter::writeSamples(const std::vector<float>& samples) {
QMutexLocker locker(&mutex_);
if (!recording_ || !currentStream_) return;
if (samples.empty()) return;
// 1. 写入 WAV 文件 (float -> int16)
for (float s : samples) {
s = std::max(-1.0f, std::min(1.0f, s)); // clip
int16_t val = static_cast<int16_t>(s * 32767.0f);
*currentStream_ << val;
samplesWritten_++;
totalSamples_++;
}
// 2. 用 VAD 检测语音活动
bool isSpeaking = vad_->process(samples);
// 3. 静音段切换逻辑:
// 检测到「说话 → 静音」的过渡,连续静音帧数达到阈值时切换
if (isSpeaking) {
silenceFramesAfterSpeech_ = 0;
wasSpeaking_ = true;
} else if (wasSpeaking_) {
silenceFramesAfterSpeech_++;
if (silenceFramesAfterSpeech_ >= silenceFramesNeeded_ && static_cast<int>(samplesWritten_) > sampleRate_ / 2) {
// 至少有 0.5 秒音频才切换
LOG_DEBUG(kTag, QString("检测到静音段 (连续 %1 帧, 能量: %2),切换 WAV 文件")
.arg(silenceFramesAfterSpeech_)
.arg(vad_->currentEnergy(), 0, 'f', 4));
// 完成当前文件
finalizeWavFile();
closeCurrentFile();
// 发射完成信号
int durationMs = static_cast<int>(samplesWritten_ * 1000 / sampleRate_);
QString completedPath = currentFilePath_;
emit chunkCompleted(completedPath, durationMs);
// 打开新文件
samplesWritten_ = 0;
silenceFramesAfterSpeech_ = 0;
wasSpeaking_ = false;
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.015f, 3);
if (!openNewFile()) {
LOG_ERROR(kTag, "无法打开新的 WAV 文件,停止录制");
recording_ = false;
return;
}
}
}
// else: 还没开始说话,不计数
}
/**
 * @brief Ends the recording session and closes the current WAV file.
 *
 * The header of the current file is finalized only when samples were written
 * to it. chunkCompleted() is deliberately not emitted here because the
 * trailing segment may be too short; callers that need it must handle it
 * themselves. Calling stop() while not recording does nothing.
 */
void StreamingAudioWriter::stop() {
    QMutexLocker locker(&mutex_);
    if (recording_) {
        const bool hasAudio = samplesWritten_ > 0;
        if (hasAudio) {
            finalizeWavFile();
        }
        closeCurrentFile();
        recording_ = false;

        const double seconds = totalSamples_ * 1000.0 / sampleRate_ / 1000.0;
        LOG_INFO(kTag, QString("流式录制已停止 (总计: %1 样本, 约 %2 秒)")
            .arg(totalSamples_).arg(seconds, 0, 'f', 1));
    }
}
/** @brief Returns a copy of the current WAV file path, read under mutex_. */
QString StreamingAudioWriter::currentFilePath() const
{
    QMutexLocker guard(&mutex_);
    return currentFilePath_;
}
/**
 * @brief Returns the duration in milliseconds of all samples written since start().
 *
 * Computed from totalSamples_ and sampleRate_ while holding mutex_.
 */
int StreamingAudioWriter::recordedDurationMs() const
{
    QMutexLocker guard(&mutex_);
    const auto elapsedMs = totalSamples_ * 1000 / sampleRate_;
    return static_cast<int>(elapsedMs);
}
bool StreamingAudioWriter::openNewFile() {
// 生成文件名
QString dir = getAudioStorageDir(debugEnabled_, debugDir_);
QString timestamp = QDateTime::currentDateTime().toString("yyyyMMdd_HHmmss_zzz");
currentFilePath_ = QString("%1/record_%2.wav").arg(dir).arg(timestamp);
currentFile_ = new QFile(currentFilePath_);
if (!currentFile_->open(QIODevice::WriteOnly)) {
LOG_ERROR(kTag, QString("无法创建 WAV 文件: %1").arg(currentFilePath_));
delete currentFile_;
currentFile_ = nullptr;
return false;
}
currentStream_ = new QDataStream(currentFile_);
currentStream_->setByteOrder(QDataStream::LittleEndian);
// 初始化并写入 WAV 头
WavHeader header{};
memcpy(header.riff, "RIFF", 4);
memcpy(header.wave, "WAVE", 4);
memcpy(header.fmt, "fmt ", 4);
header.fmtSize = 16;
header.audioFormat = 1; // PCM
header.numChannels = 1; // mono
header.sampleRate = static_cast<uint32_t>(sampleRate_);
header.byteRate = static_cast<uint32_t>(sampleRate_) * 2;
header.blockAlign = 2;
header.bitsPerSample = 16;
memcpy(header.data, "data", 4);
header.dataSize = 0;
header.fileSize = sizeof(WavHeader) - 8;
// 写入头
currentStream_->writeRawData(header.riff, 4);
*currentStream_ << header.fileSize;
currentStream_->writeRawData(header.wave, 4);
currentStream_->writeRawData(header.fmt, 4);
*currentStream_ << header.fmtSize;
*currentStream_ << header.audioFormat;
*currentStream_ << header.numChannels;
*currentStream_ << header.sampleRate;
*currentStream_ << header.byteRate;
*currentStream_ << header.blockAlign;
*currentStream_ << header.bitsPerSample;
currentStream_->writeRawData(header.data, 4);
*currentStream_ << header.dataSize;
LOG_DEBUG(kTag, QString("新 WAV 文件已打开: %1").arg(currentFilePath_));
return true;
}
/**
 * @brief Destroys the data stream, closes and destroys the file, and clears the path.
 *
 * The WAV header is not touched here. Because currentFilePath_ is cleared,
 * a caller that still needs the path must copy it before calling this.
 */
void StreamingAudioWriter::closeCurrentFile() {
    // Deleting a null pointer is a no-op, so no guard is needed for the stream.
    delete currentStream_;
    currentStream_ = nullptr;

    if (currentFile_ != nullptr) {
        currentFile_->close();
        delete currentFile_;
        currentFile_ = nullptr;
    }

    currentFilePath_.clear();
}
void StreamingAudioWriter::finalizeWavFile() {
if (!currentFile_ || !currentFile_->isOpen()) return;
// 计算实际大小
uint32_t dataBytes = samplesWritten_ * 2; // 16-bit mono
uint32_t fileSize = sizeof(WavHeader) + dataBytes - 8;
// 回写到文件头更新大小
currentFile_->seek(4);
currentStream_->writeRawData(reinterpret_cast<const char*>(&fileSize), 4);
currentFile_->seek(sizeof(WavHeader) - 4); // dataSize 偏移
currentStream_->writeRawData(reinterpret_cast<const char*>(&dataBytes), 4);
currentFile_->flush();
int durationMs = static_cast<int>(samplesWritten_ * 1000 / sampleRate_);
LOG_DEBUG(kTag, QString("WAV 文件已保存: %1 (时长: %2ms, 样本: %3)")
.arg(currentFilePath_).arg(durationMs).arg(samplesWritten_));
}
} // namespace impress

View File

@ -0,0 +1,140 @@
#pragma once
#include <QObject>
#include <QString>
#include <QFile>
#include <QDataStream>
#include <QMutex>
#include <cstdint>
#include <memory>
#include <vector>
namespace impress {
class VoiceActivityDetector;
/**
 * @brief Streams captured PCM audio into consecutive 16-bit mono WAV files,
 *        split at pauses detected by a voice activity detector (VAD).
 *
 * Workflow:
 * 1. Incoming samples are appended to the currently open WAV file.
 * 2. The same samples are passed to the VAD.
 * 3. After speech followed by enough consecutive silent results (~1 s by
 *    design), the current file is finalized and closed, chunkCompleted() is
 *    emitted and a new file is opened.
 * 4. Recording continues into the new WAV file.
 *
 * Storage directory (see getAudioStorageDir()):
 * - debug enabled: the supplied debug directory, or "impress_audio_debug"
 *   under the system temp path when none is supplied
 * - debug disabled: "." when PLATFORM_WINDOWS is defined, otherwise the
 *   system temp path
 */
class StreamingAudioWriter : public QObject {
Q_OBJECT
public:
explicit StreamingAudioWriter(QObject* parent = nullptr);
~StreamingAudioWriter() override;
/**
 * @brief Starts a recording session and opens the first WAV file.
 * @param sampleRate Sample rate in Hz (e.g. 16000).
 * @param debugEnabled Whether debug storage is enabled.
 * @param debugDir Directory used when debugEnabled is true; when empty a
 *                 default location is used.
 * @return true on success; false otherwise (for example when already
 *         recording or when the directory or file cannot be created).
 */
bool start(int sampleRate, bool debugEnabled = false, const QString& debugDir = QString());
/**
 * @brief Writes PCM samples (float, nominally -1.0 ~ 1.0; values outside are clipped).
 *
 * For each call:
 * 1. Converts the samples to int16 and appends them to the current WAV file.
 * 2. Runs the VAD on the samples.
 * 3. When the silence condition is met, switches files and emits chunkCompleted().
 */
void writeSamples(const std::vector<float>& samples);
/**
 * @brief Stops recording and closes the current file; does not emit chunkCompleted().
 */
void stop();
/** @brief Whether a recording session is active (reads recording_ without locking mutex_). */
bool isRecording() const { return recording_; }
/** @brief Path of the WAV file currently being written. */
QString currentFilePath() const;
/** @brief Number of samples written to the current file (samplesWritten_ returned as int, read without locking mutex_). */
int currentSampleCount() const { return samplesWritten_; }
/** @brief Total duration recorded since start(), in milliseconds. */
int recordedDurationMs() const;
/**
 * @brief Returns the audio storage directory for the given debug settings.
 */
static QString getAudioStorageDir(bool debugEnabled, const QString& debugDir = QString());
signals:
/**
 * @brief Emitted when a WAV chunk has been completed.
 * @param filePath Path of the WAV file that was just completed.
 * @param durationMs Duration of the chunk in milliseconds.
 */
void chunkCompleted(const QString& filePath, int durationMs);
/** @brief Recording error. */
void error(const QString& message);
private:
/** Opens a new WAV file and writes its initial header. */
bool openNewFile();
/** Closes the current WAV file (does not update the file header). */
void closeCurrentFile();
/** Updates the data chunk size and RIFF size in the WAV file header. */
void finalizeWavFile();
// WAV file header layout (44 bytes)
struct WavHeader {
char riff[4]; // "RIFF"
uint32_t fileSize; // total file size - 8
char wave[4]; // "WAVE"
char fmt[4]; // "fmt "
uint32_t fmtSize; // fmt chunk size (16)
uint16_t audioFormat; // audio format (1 = PCM)
uint16_t numChannels; // channel count (1 = mono)
uint32_t sampleRate; // sample rate
uint32_t byteRate; // byte rate
uint16_t blockAlign; // block alignment
uint16_t bitsPerSample;// bit depth (16)
char data[4]; // "data"
uint32_t dataSize; // data chunk size
};
int sampleRate_ = 16000;
bool recording_ = false;
bool debugEnabled_ = false;
QString debugDir_;
// VAD
std::unique_ptr<VoiceActivityDetector> vad_;
bool wasSpeaking_ = false; // set once speech is detected in the current file; cleared when a new file is started
int silenceFramesAfterSpeech_ = 0; // consecutive silent results since the last speech
int silenceFramesNeeded_ = 4; // silent results required before switching files (~1 s); start() overwrites this with 1000 / 30
// Current file
QString currentFilePath_;
QFile* currentFile_ = nullptr;
QDataStream* currentStream_ = nullptr;
uint32_t samplesWritten_ = 0;
int64_t totalSamples_ = 0;
// Frame size for energy computation (VAD frame size)
int vadFrameSize_ = 480; // 16000 * 30ms = 480
mutable QMutex mutex_;
};
} // namespace impress

View File

@ -120,6 +120,16 @@ void SettingsPage::setupUI() {
populateAudioDevices();
audioLayout->addRow("输入设备:", audioDeviceCombo_);
// 音频调试目录
auto* debugDirRow = new QHBoxLayout();
audioDebugDirEdit_ = new QLineEdit(this);
audioDebugDirEdit_->setPlaceholderText("流式识别 WAV 文件保存路径(为空时使用系统临时目录)");
audioDebugDirBtn_ = new QPushButton("浏览...", this);
connect(audioDebugDirBtn_, &QPushButton::clicked, this, &SettingsPage::onBrowseAudioDebugDir);
debugDirRow->addWidget(audioDebugDirEdit_);
debugDirRow->addWidget(audioDebugDirBtn_);
audioLayout->addRow("调试音频目录:", debugDirRow);
bufferSizeSpin_ = new QSpinBox(this);
bufferSizeSpin_->setRange(10, 100);
bufferSizeSpin_->setValue(20);
@ -206,6 +216,7 @@ void SettingsPage::loadFromConfig() {
// 恢复音频设备选择
int savedDevice = configManager_->get("audio.input_device").toInt();
selectAudioDevice(savedDevice);
audioDebugDirEdit_->setText(configManager_->get("audio.debug_dir").toString());
themeCombo_->setCurrentText(configManager_->get("ui.theme").toString());
fontSizeSpin_->setValue(configManager_->get("ui.font_size").toInt());
@ -229,6 +240,7 @@ void SettingsPage::saveToConfig() {
batch["stt.temperature"] = temperatureSpin_->value();
batch["shortcuts.voice_hotkey"] = hotkeyRecorder_->hotkeyText();
batch["audio.input_device"] = getSelectedAudioDeviceIndex();
batch["audio.debug_dir"] = audioDebugDirEdit_->text();
batch["audio.buffer_size_ms"] = bufferSizeSpin_->value();
batch["audio.chunk_duration_ms"] = chunkDurationSpin_->value();
batch["audio.padding_ms"] = paddingSpin_->value();
@ -293,6 +305,14 @@ void SettingsPage::onBrowseTokensPath() {
}
}
void SettingsPage::onBrowseAudioDebugDir() {
QString path = QFileDialog::getExistingDirectory(this, "选择调试音频目录", "",
QFileDialog::ShowDirsOnly);
if (!path.isEmpty()) {
audioDebugDirEdit_->setText(path);
}
}
void SettingsPage::onSaveConfig() {
saveToConfig();
if (configManager_->save()) {

View File

@ -31,6 +31,7 @@ public:
private slots:
void onBrowseModelPath();
void onBrowseTokensPath();
void onBrowseAudioDebugDir();
void onSaveConfig();
void onResetConfig();
@ -62,6 +63,8 @@ private:
// 音频设置
QComboBox* audioDeviceCombo_;
QLineEdit* audioDebugDirEdit_;
QPushButton* audioDebugDirBtn_;
QSpinBox* bufferSizeSpin_;
QSpinBox* chunkDurationSpin_;
QSpinBox* paddingSpin_;

View File

@ -1,7 +1,7 @@
#include "stt_test_page.h"
#include "core/sense_voice_engine.h"
#include "audio/audio_capture.h"
#include "audio/audio_ring_buffer.h"
#include "audio/streaming_audio_writer.h"
#include "widgets/audio_waveform.h"
#include "app/config_manager.h"
#include "utils/logger.h"
@ -18,7 +18,6 @@
#include <QMessageBox>
#include <QDateTime>
#include <QFileInfo>
#include <QTimer>
#include <QtConcurrent>
static const char* const kTag = "STTTestPage";
@ -32,23 +31,21 @@ STTTestPage::STTTestPage(ConfigManager* configManager,
, configManager_(configManager)
, sttEngine_(sttEngine)
, audioCapture_(new AudioCapture(this))
, inferenceTimer_(new QTimer(this))
, streamingWriter_(new StreamingAudioWriter(this))
{
setupUI();
// 信号连接
connect(audioCapture_, &AudioCapture::audioDataReady,
this, &STTTestPage::onAudioDataReady);
connect(streamingWriter_, &StreamingAudioWriter::chunkCompleted,
this, &STTTestPage::onChunkCompleted);
connect(sttEngine_, &SenseVoiceEngine::modelLoaded,
this, &STTTestPage::onModelLoaded);
connect(sttEngine_, &SenseVoiceEngine::modelLoadError,
this, &STTTestPage::onModelLoadError);
connect(sttEngine_, &SenseVoiceEngine::modelUnloaded,
this, &STTTestPage::onModelUnloaded);
// 推理定时器:周期性触发后台推理
connect(inferenceTimer_, &QTimer::timeout,
this, &STTTestPage::onInferenceTimer);
}
STTTestPage::~STTTestPage() = default;
@ -64,13 +61,6 @@ void STTTestPage::setupUI() {
deviceCombo_->addItems(AudioCapture::getDeviceList());
controlLayout->addRow("输入设备:", deviceCombo_);
chunkSizeSpin_ = new QSpinBox(this);
chunkSizeSpin_->setRange(500, 10000);
chunkSizeSpin_->setSingleStep(500);
chunkSizeSpin_->setValue(3000);
chunkSizeSpin_->setSuffix(" ms");
controlLayout->addRow("推理间隔:", chunkSizeSpin_);
auto* btnLayout = new QHBoxLayout();
recordBtn_ = new QPushButton("开始录音", this);
recordBtn_->setMinimumWidth(120);
@ -117,16 +107,14 @@ void STTTestPage::updateUIState() {
? "QPushButton { font-weight: bold; padding: 8px 16px; background-color: #e74c3c; color: white; }"
: "QPushButton { font-weight: bold; padding: 8px 16px; }");
deviceCombo_->setEnabled(!isRecording_ && !isLoadingModel_);
chunkSizeSpin_->setEnabled(!isRecording_ && !isLoadingModel_);
}
void STTTestPage::onToggleRecording() {
if (isRecording_) {
streamingWriter_->stop();
audioCapture_->stop();
inferenceTimer_->stop();
isRecording_ = false;
isInferencing_ = false;
audioBuffer_.clear();
} else {
// 检查全局模型是否已加载
if (!sttEngine_->isLoaded()) {
@ -172,81 +160,104 @@ void STTTestPage::onModelUnloaded() {
void STTTestPage::startAudioCapture() {
int deviceIdx = deviceCombo_->currentIndex() - 1;
audioSampleRate_ = configManager_->get("stt.sample_rate").toInt();
bool debugEnabled = configManager_->get("stt.debug_save_audio").toBool();
// 启动流式录制器
if (!streamingWriter_->start(audioSampleRate_, debugEnabled)) {
QMessageBox::critical(this, "错误", "无法启动流式录制器");
return;
}
// 启动音频采集
if (!audioCapture_->start(deviceIdx, audioSampleRate_)) {
streamingWriter_->stop();
QMessageBox::critical(this, "错误", "无法启动音频采集");
return;
}
isRecording_ = true;
audioBuffer_.clear();
isInferencing_ = false;
completedCount_ = 0;
// 启动周期性推理定时器
startInferenceTimer();
statusLabel_->setText("录音中 | 模型已加载");
statusLabel_->setText("录音中 | VAD 流式识别");
updateUIState();
}
void STTTestPage::startInferenceTimer() {
int interval = chunkSizeSpin_->value(); // 与推理间隔同步
inferenceTimer_->start(interval);
}
void STTTestPage::onAudioDataReady(const std::vector<float>& samples, int /* sampleRate */) {
// 仅缓存音频数据,不直接调用推理
// 避免推理阻塞音频采集线程
audioBuffer_.insert(audioBuffer_.end(), samples.begin(), samples.end());
// 写入流式录制器WAV 文件 + VAD 静音检测)
streamingWriter_->writeSamples(samples);
// 更新波形显示(使用最新数据片段)
// 更新波形显示
waveform_->setSamples(samples);
}
void STTTestPage::onInferenceTimer() {
if (!sttEngine_->isLoaded() || isInferencing_) {
void STTTestPage::onChunkCompleted(const QString& filePath, int durationMs) {
completedCount_++;
LOG_INFO(kTag, QString("WAV 片段 #%1 已完成: %2 (%3ms)")
.arg(completedCount_).arg(filePath).arg(durationMs));
statusLabel_->setText(QString("正在识别 #%1 (%2ms)...").arg(completedCount_).arg(durationMs));
// 在后台线程对 WAV 文件进行识别
transcribeChunk(filePath, durationMs);
}
void STTTestPage::transcribeChunk(const QString& filePath, int /* durationMs */) {
if (isInferencing_) {
// 上一个识别还没完成,跳过(避免堆积)
LOG_WARNING(kTag, "上一个识别仍在进行中,跳过当前片段");
return;
}
int chunkSize = audioSampleRate_ * chunkSizeSpin_->value() / 1000;
if (static_cast<int>(audioBuffer_.size()) < chunkSize) {
return; // 缓冲区数据不足,等待下一次
}
// 提取一个推理块的音频
std::vector<float> chunk(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize);
audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + chunkSize);
// 在后台线程执行推理
isInferencing_ = true;
statusLabel_->setText("推理中...");
int sampleRate = audioSampleRate_;
QString language = configManager_->get("stt.language").toString();
// 在后台线程读取 WAV 文件并推理
(void)QtConcurrent::run([this, filePath]() {
QString text;
QString errorMsg;
(void)QtConcurrent::run([this, chunk, sampleRate, language]() {
auto result = sttEngine_->infer(chunk, sampleRate, language);
// 读取 WAV 文件为 float 样本
QFile file(filePath);
if (!file.open(QIODevice::ReadOnly)) {
errorMsg = QString("无法打开 WAV 文件: %1").arg(filePath);
} else {
// 跳过 44 字节 WAV 头
file.seek(44);
QByteArray raw = file.readAll();
file.close();
// 回到主线程更新 UI
QMetaObject::invokeMethod(this, [this, result]() {
isInferencing_ = false;
if (result.text.isEmpty() && !result.text.isNull()) {
// 静音段
latencyLabel_->setText(QString("延迟: %1 ms").arg(result.latency_ms, 0, 'f', 1));
} else {
emit onRecognitionResult(result.text, result.confidence,
result.latency_ms, result.isFinal);
// int16 -> float
int numSamples = raw.size() / 2;
std::vector<float> samples(numSamples);
for (int i = 0; i < numSamples; i++) {
int16_t val = *reinterpret_cast<const int16_t*>(raw.data() + i * 2);
samples[i] = static_cast<float>(val) / 32767.0f;
}
if (!sttEngine_->isLoaded()) {
text = "[错误] 模型未加载";
} else {
QString language = configManager_->get("stt.language").toString();
auto result = sttEngine_->infer(samples, audioSampleRate_, language);
text = result.text;
errorMsg = result.text.startsWith("[错误]") ? result.text : QString();
}
}
// 回到主线程更新 UI
QMetaObject::invokeMethod(this, [this, text, errorMsg, filePath]() {
isInferencing_ = false;
if (!errorMsg.isEmpty() && text.startsWith("[错误]")) {
statusLabel_->setText(text);
} else if (text.isEmpty()) {
statusLabel_->setText(QString("片段 #%1: 静音").arg(completedCount_));
} else {
emit onRecognitionResult(text, 1.0f, 0, true);
}
// 更新状态
if (isRecording_) {
int bufMs = (audioSampleRate_ > 0)
? static_cast<int>(audioBuffer_.size() * 1000 / audioSampleRate_)
: 0;
statusLabel_->setText(
QString("录音中 | 缓冲区: %1 ms").arg(bufMs));
statusLabel_->setText(QString("录音中 | 已识别 %1 个片段").arg(completedCount_));
}
}, Qt::QueuedConnection);
});
@ -256,8 +267,8 @@ void STTTestPage::onRecognitionResult(const QString& text, float confidence,
double latency, bool isFinal)
{
QString timestamp = QDateTime::currentDateTime().toString("hh:mm:ss");
QString line = QString("[%1] %2 (置信度: %3%, 延迟: %4 ms)\n")
.arg(timestamp, text)
QString line = QString("[%1] #%2 %3 (置信度: %4%, 延迟: %5 ms)")
.arg(timestamp).arg(completedCount_).arg(text)
.arg(confidence * 100, 0, 'f', 1)
.arg(latency, 0, 'f', 1);

View File

@ -9,20 +9,20 @@ class QPushButton;
class QComboBox;
class QTextEdit;
class QSpinBox;
class QTimer;
namespace impress {
class ConfigManager;
class SenseVoiceEngine;
class AudioCapture;
class StreamingAudioWriter;
/**
* @brief STT
*
* +
* + VAD WAV +
*
* 使 SenseVoice
* 使 VAD WAV
*/
class STTTestPage : public QWidget {
Q_OBJECT
@ -35,22 +35,22 @@ public:
private slots:
void onToggleRecording();
void onAudioDataReady(const std::vector<float>& samples, int sampleRate);
void onChunkCompleted(const QString& filePath, int durationMs);
void onRecognitionResult(const QString& text, float confidence, double latency, bool isFinal);
void onModelLoaded(const QString& modelPath);
void onModelLoadError(const QString& modelPath, const QString& error);
void onModelUnloaded();
void onInferenceTimer();
private:
void setupUI();
void updateUIState();
void startAudioCapture();
void startInferenceTimer();
void transcribeChunk(const QString& filePath, int durationMs);
ConfigManager* configManager_;
SenseVoiceEngine* sttEngine_;
AudioCapture* audioCapture_;
QTimer* inferenceTimer_;
StreamingAudioWriter* streamingWriter_;
// UI 控件
QComboBox* deviceCombo_;
@ -59,13 +59,12 @@ private:
QLabel* latencyLabel_;
QLabel* statusLabel_;
AudioWaveform* waveform_;
QSpinBox* chunkSizeSpin_;
bool isRecording_ = false;
bool isLoadingModel_ = false;
bool isInferencing_ = false;
int audioSampleRate_ = 16000;
std::vector<float> audioBuffer_;
int completedCount_ = 0; // 已完成文件计数
};
} // namespace impress