fix: 降低 VAD 能量阈值并添加 STT 推理诊断日志
- VAD 能量阈值从 0.015 降至 0.003,适配低增益麦克风
- transcribeChunk 添加 WAV 文件大小、样本数、RMS 诊断日志
- onChunkCompleted 增加空文件路径检查

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
271fccb39b
commit
801dbe1ec2
@ -54,8 +54,8 @@ bool StreamingAudioWriter::start(int sampleRate, bool debugEnabled, const QStrin
|
|||||||
wasSpeaking_ = false;
|
wasSpeaking_ = false;
|
||||||
silenceFramesAfterSpeech_ = 0;
|
silenceFramesAfterSpeech_ = 0;
|
||||||
|
|
||||||
// 初始化 VAD(30ms 帧,低能量阈值)
|
// 初始化 VAD(30ms 帧,降低能量阈值以适配低增益麦克风)
|
||||||
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.015f, 3);
|
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.003f, 3);
|
||||||
|
|
||||||
// VAD 帧大小
|
// VAD 帧大小
|
||||||
vadFrameSize_ = sampleRate_ * 30 / 1000;
|
vadFrameSize_ = sampleRate_ * 30 / 1000;
|
||||||
@ -128,7 +128,7 @@ void StreamingAudioWriter::writeSamples(const std::vector<float>& samples) {
|
|||||||
samplesWritten_ = 0;
|
samplesWritten_ = 0;
|
||||||
silenceFramesAfterSpeech_ = 0;
|
silenceFramesAfterSpeech_ = 0;
|
||||||
wasSpeaking_ = false;
|
wasSpeaking_ = false;
|
||||||
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.015f, 3);
|
vad_ = std::make_unique<VoiceActivityDetector>(sampleRate_, 30, 0.003f, 3);
|
||||||
|
|
||||||
if (!openNewFile()) {
|
if (!openNewFile()) {
|
||||||
LOG_ERROR(kTag, "无法打开新的 WAV 文件,停止录制");
|
LOG_ERROR(kTag, "无法打开新的 WAV 文件,停止录制");
|
||||||
|
|||||||
@ -196,6 +196,12 @@ void STTTestPage::onChunkCompleted(const QString& filePath, int durationMs) {
|
|||||||
LOG_INFO(kTag, QString("WAV 片段 #%1 已完成: %2 (%3ms)")
|
LOG_INFO(kTag, QString("WAV 片段 #%1 已完成: %2 (%3ms)")
|
||||||
.arg(completedCount_).arg(filePath).arg(durationMs));
|
.arg(completedCount_).arg(filePath).arg(durationMs));
|
||||||
|
|
||||||
|
if (filePath.isEmpty()) {
|
||||||
|
LOG_ERROR(kTag, "chunkCompleted 信号中文件路径为空,跳过识别");
|
||||||
|
statusLabel_->setText("错误:文件路径为空");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
statusLabel_->setText(QString("正在识别 #%1 (%2ms)...").arg(completedCount_).arg(durationMs));
|
statusLabel_->setText(QString("正在识别 #%1 (%2ms)...").arg(completedCount_).arg(durationMs));
|
||||||
|
|
||||||
// 在后台线程对 WAV 文件进行识别
|
// 在后台线程对 WAV 文件进行识别
|
||||||
@ -221,6 +227,9 @@ void STTTestPage::transcribeChunk(const QString& filePath, int /* durationMs */)
|
|||||||
if (!file.open(QIODevice::ReadOnly)) {
|
if (!file.open(QIODevice::ReadOnly)) {
|
||||||
errorMsg = QString("无法打开 WAV 文件: %1").arg(filePath);
|
errorMsg = QString("无法打开 WAV 文件: %1").arg(filePath);
|
||||||
} else {
|
} else {
|
||||||
|
// 诊断:文件大小
|
||||||
|
qint64 fileSize = file.size();
|
||||||
|
|
||||||
// 跳过 44 字节 WAV 头
|
// 跳过 44 字节 WAV 头
|
||||||
file.seek(44);
|
file.seek(44);
|
||||||
QByteArray raw = file.readAll();
|
QByteArray raw = file.readAll();
|
||||||
@ -229,13 +238,20 @@ void STTTestPage::transcribeChunk(const QString& filePath, int /* durationMs */)
|
|||||||
// int16 -> float
|
// int16 -> float
|
||||||
int numSamples = raw.size() / 2;
|
int numSamples = raw.size() / 2;
|
||||||
std::vector<float> samples(numSamples);
|
std::vector<float> samples(numSamples);
|
||||||
|
double rms = 0.0;
|
||||||
for (int i = 0; i < numSamples; i++) {
|
for (int i = 0; i < numSamples; i++) {
|
||||||
int16_t val = *reinterpret_cast<const int16_t*>(raw.data() + i * 2);
|
int16_t val = *reinterpret_cast<const int16_t*>(raw.data() + i * 2);
|
||||||
samples[i] = static_cast<float>(val) / 32767.0f;
|
samples[i] = static_cast<float>(val) / 32767.0f;
|
||||||
|
rms += static_cast<double>(samples[i]) * samples[i];
|
||||||
}
|
}
|
||||||
|
rms = std::sqrt(rms / std::max(1, numSamples));
|
||||||
|
|
||||||
|
LOG_DEBUG(kTag, QString("WAV 诊断: %1 (文件大小: %2 字节, 样本: %3, RMS: %4)")
|
||||||
|
.arg(filePath).arg(fileSize).arg(numSamples).arg(rms, 0, 'f', 6));
|
||||||
|
|
||||||
if (!sttEngine_->isLoaded()) {
|
if (!sttEngine_->isLoaded()) {
|
||||||
text = "[错误] 模型未加载";
|
text = "[错误] 模型未加载";
|
||||||
|
LOG_ERROR(kTag, "模型未加载,无法推理");
|
||||||
} else {
|
} else {
|
||||||
QString language = configManager_->get("stt.language").toString();
|
QString language = configManager_->get("stt.language").toString();
|
||||||
auto result = sttEngine_->infer(samples, audioSampleRate_, language);
|
auto result = sttEngine_->infer(samples, audioSampleRate_, language);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user