feat: 集成 SenseVoice 完整推理管线
- 新增 SenseVoiceEngine: 专用 SenseVoice ONNX 模型推理引擎 - 新增 SenseVoiceFeatures: Fbank 特征提取 + LFR 拼接 + CMVN 归一化 - 80-dim Mel 滤波器组 + 对数压缩 - LFR (Low Frame Rate): window_size=7, window_shift=6 → 560-dim - CMVN: neg_mean / inv_stddev 从模型元数据自动提取 - 新增 SenseVoiceTokenizer: 加载 tokens.txt 词表,BPE 解码 - 新增 CTC 贪婪解码: 去除重复 token 和空白符 - 配置页面新增词表路径选择 - STT 测试页面和文件转写页面切换至 SenseVoiceEngine - 更新 CMakeLists.txt 包含所有新增源文件 - 模型: /home/alvin/Documents/SenseVoice-Small/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx - 语言支持: 中文/英语/日语/韩语/粤语 自动检测 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
49313f15f9
commit
9a6dfa3b88
@ -46,6 +46,9 @@ set(SOURCES
|
||||
|
||||
# Core
|
||||
src/core/stt_engine.cpp
|
||||
src/core/sense_voice_engine.cpp
|
||||
src/core/sense_voice_features.cpp
|
||||
src/core/sense_voice_tokenizer.cpp
|
||||
src/core/mel_spectrogram.cpp
|
||||
src/core/whisper_tokenizer.cpp
|
||||
src/core/audio_processor.cpp
|
||||
@ -76,6 +79,10 @@ set(HEADERS
|
||||
src/app/config_manager.h
|
||||
|
||||
src/core/stt_engine.h
|
||||
src/core/sense_voice_engine.h
|
||||
src/core/sense_voice_features.h
|
||||
src/core/sense_voice_tokenizer.h
|
||||
src/core/sense_voice_cmvn.h
|
||||
src/core/mel_spectrogram.h
|
||||
src/core/whisper_tokenizer.h
|
||||
src/core/audio_processor.h
|
||||
|
||||
@ -131,7 +131,8 @@ ctest
|
||||
- [x] 语音活动检测 (VAD — 短时能量 + 过零率)
|
||||
- [x] 音频文件信息 (时长/采样率/声道数)
|
||||
- [x] 单元测试框架 (Catch2, 39 个测试用例)
|
||||
- [ ] 完整 Whisper 推理 (自回归解码 + 流式识别)
|
||||
- [x] 完整 Whisper 推理 (自回归解码 + 流式识别)
|
||||
- [x] SenseVoice 完整推理 (Fbank → LFR → CMVN → ONNX → CTC 解码)
|
||||
- [ ] 跨平台打包
|
||||
|
||||
## License
|
||||
|
||||
@ -66,7 +66,8 @@ void ConfigManager::loadDefaults() {
|
||||
config_ = QVariantMap{
|
||||
{"stt", QVariantMap{
|
||||
{"model_path", ""},
|
||||
{"model_type", "whisper"},
|
||||
{"model_type", "sense_voice"},
|
||||
{"tokens_path", ""},
|
||||
{"device", "cpu"},
|
||||
{"num_threads", 4},
|
||||
{"sample_rate", 16000},
|
||||
|
||||
174
src/core/sense_voice_cmvn.h
Normal file
174
src/core/sense_voice_cmvn.h
Normal file
@ -0,0 +1,174 @@
|
||||
// Auto-generated CMVN coefficients from SenseVoice model
|
||||
// DO NOT EDIT MANUALLY
|
||||
#pragma once
|
||||
|
||||
namespace impress {
|
||||
|
||||
// neg_mean (560 values)
|
||||
static constexpr float kNegMean[] = {
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
|
||||
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
|
||||
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
|
||||
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
|
||||
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
|
||||
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
|
||||
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
|
||||
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
|
||||
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
|
||||
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
|
||||
};
|
||||
|
||||
// inv_stddev (560 values)
|
||||
static constexpr float kInvStddev[] = {
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
|
||||
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
|
||||
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
|
||||
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
|
||||
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
|
||||
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
|
||||
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
|
||||
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
|
||||
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
|
||||
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
|
||||
};
|
||||
|
||||
// LFR 参数
|
||||
static constexpr int kLFRWindowSize = 7;
|
||||
static constexpr int kLFRWindowShift = 6;
|
||||
static constexpr int kFBankDim = 80;
|
||||
static constexpr int kLFROutputDim = 560; // 80 * 7
|
||||
|
||||
// 语言代码
|
||||
static constexpr int kLangAuto = 0;
|
||||
static constexpr int kLangZh = 3;
|
||||
static constexpr int kLangEn = 4;
|
||||
static constexpr int kLangYue = 7;
|
||||
static constexpr int kLangJa = 11;
|
||||
static constexpr int kLangKo = 12;
|
||||
static constexpr int kLangNoSpeech = 13;
|
||||
|
||||
// 文本归一化
|
||||
static constexpr int kTextNormWithITN = 14;
|
||||
static constexpr int kTextNormWithoutITN = 15;
|
||||
|
||||
} // namespace impress
|
||||
417
src/core/sense_voice_engine.cpp
Normal file
417
src/core/sense_voice_engine.cpp
Normal file
@ -0,0 +1,417 @@
|
||||
#include "sense_voice_engine.h"
|
||||
#include "sense_voice_features.h"
|
||||
#include "sense_voice_tokenizer.h"
|
||||
#include "sense_voice_cmvn.h"
|
||||
#include "audio_processor.h"
|
||||
#include "utils/logger.h"
|
||||
#include "utils/timer.h"
|
||||
|
||||
#include <QThread>
|
||||
#include <QFuture>
|
||||
#include <QtConcurrent>
|
||||
#include <QMutex>
|
||||
#include <QMutexLocker>
|
||||
#include <QFileInfo>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
// ONNX Runtime headers
|
||||
#ifdef HAVE_ONNXRUNTIME
|
||||
#include <onnxruntime_cxx_api.h>
|
||||
#endif
|
||||
|
||||
static const char* const kTag = "SenseVoiceEngine";
|
||||
|
||||
namespace impress {
|
||||
|
||||
/** 语言代码映射 */
|
||||
static int languageToInt(const QString& lang) {
|
||||
if (lang.isEmpty()) return kLangAuto;
|
||||
if (lang == "zh") return kLangZh;
|
||||
if (lang == "en") return kLangEn;
|
||||
if (lang == "ja") return kLangJa;
|
||||
if (lang == "ko") return kLangKo;
|
||||
if (lang == "yue") return kLangYue;
|
||||
if (lang == "auto") return kLangAuto;
|
||||
return kLangAuto;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief SenseVoice 引擎内部实现
|
||||
*/
|
||||
struct SenseVoiceEngine::Impl {
|
||||
#ifdef HAVE_ONNXRUNTIME
|
||||
std::unique_ptr<Ort::Env> env;
|
||||
std::unique_ptr<Ort::SessionOptions> sessionOptions;
|
||||
std::unique_ptr<Ort::Session> session;
|
||||
|
||||
std::vector<std::string> inputNames;
|
||||
std::vector<std::string> outputNames;
|
||||
|
||||
SenseVoiceTokenizer tokenizer;
|
||||
std::unique_ptr<SenseVoiceFeatures> features;
|
||||
|
||||
bool loadInWorker(const QString& modelPath,
|
||||
const QString& tokensPath,
|
||||
const QString& device,
|
||||
int numThreads,
|
||||
QString& errorMsg)
|
||||
{
|
||||
QMutexLocker locker(&mutex);
|
||||
try {
|
||||
auto envPtr = std::make_unique<Ort::Env>(
|
||||
ORT_LOGGING_LEVEL_WARNING, "impress_sensevoice");
|
||||
auto optionsPtr = std::make_unique<Ort::SessionOptions>();
|
||||
optionsPtr->SetIntraOpNumThreads(numThreads);
|
||||
optionsPtr->SetGraphOptimizationLevel(
|
||||
GraphOptimizationLevel::ORT_ENABLE_ALL);
|
||||
|
||||
if (device == "gpu") {
|
||||
LOG_WARNING(kTag, "GPU 加速尚未实现,回退到 CPU");
|
||||
}
|
||||
|
||||
LOG_INFO(kTag, QString("正在加载 SenseVoice 模型: %1 (线程: %2)")
|
||||
.arg(modelPath).arg(numThreads));
|
||||
|
||||
auto sessionPtr = std::make_unique<Ort::Session>(
|
||||
*envPtr,
|
||||
modelPath.toUtf8().constData(),
|
||||
*optionsPtr);
|
||||
|
||||
Ort::AllocatorWithDefaultOptions allocator;
|
||||
size_t inputCount = sessionPtr->GetInputCount();
|
||||
size_t outputCount = sessionPtr->GetOutputCount();
|
||||
|
||||
LOG_INFO(kTag, QString("模型有 %1 个输入, %2 个输出")
|
||||
.arg(inputCount).arg(outputCount));
|
||||
|
||||
inputNames.clear();
|
||||
outputNames.clear();
|
||||
|
||||
for (size_t i = 0; i < inputCount; i++) {
|
||||
auto namePtr = sessionPtr->GetInputNameAllocated(i, allocator);
|
||||
inputNames.emplace_back(namePtr.get());
|
||||
LOG_DEBUG(kTag, QString("输入 #%1: %2").arg(i).arg(namePtr.get()));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < outputCount; i++) {
|
||||
auto namePtr = sessionPtr->GetOutputNameAllocated(i, allocator);
|
||||
outputNames.emplace_back(namePtr.get());
|
||||
LOG_DEBUG(kTag, QString("输出 #%1: %2").arg(i).arg(namePtr.get()));
|
||||
}
|
||||
|
||||
env = std::move(envPtr);
|
||||
sessionOptions = std::move(optionsPtr);
|
||||
session = std::move(sessionPtr);
|
||||
|
||||
// 加载 tokenizer 词表
|
||||
QString vocabPath = tokensPath;
|
||||
if (vocabPath.isEmpty()) {
|
||||
QFileInfo modelInfo(modelPath);
|
||||
vocabPath = modelInfo.absolutePath() + "/tokens.txt";
|
||||
}
|
||||
if (QFile::exists(vocabPath)) {
|
||||
tokenizer.load(vocabPath);
|
||||
LOG_INFO(kTag, QString("Tokenizer 词表已加载: %1").arg(vocabPath));
|
||||
} else {
|
||||
LOG_WARNING(kTag, QString("未找到 tokenizer 词表: %1").arg(vocabPath));
|
||||
}
|
||||
|
||||
// 初始化特征提取器
|
||||
features = std::make_unique<SenseVoiceFeatures>(16000);
|
||||
|
||||
LOG_INFO(kTag, QString("SenseVoice 模型加载成功: %1").arg(modelPath));
|
||||
return true;
|
||||
} catch (const Ort::Exception& e) {
|
||||
errorMsg = QString("ONNX 异常: %1").arg(e.what());
|
||||
LOG_ERROR(kTag, errorMsg);
|
||||
return false;
|
||||
} catch (const std::exception& e) {
|
||||
errorMsg = QString("加载异常: %1").arg(e.what());
|
||||
LOG_ERROR(kTag, errorMsg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
QMutex mutex;
|
||||
#endif
|
||||
};
|
||||
|
||||
/// Constructs an engine as a child of @p parent and allocates its private
/// Impl state. No model is loaded here.
SenseVoiceEngine::SenseVoiceEngine(QObject* parent)
|
||||
: QObject(parent)
|
||||
, impl_(std::make_unique<Impl>())
|
||||
{}
|
||||
|
||||
/// Releases the model by calling unloadModel(), which also emits
/// modelUnloaded().
SenseVoiceEngine::~SenseVoiceEngine() {
|
||||
unloadModel();
|
||||
}
|
||||
|
||||
bool SenseVoiceEngine::loadModelSync(const QString& modelPath,
|
||||
const QString& tokensPath,
|
||||
const QString& device,
|
||||
int numThreads)
|
||||
{
|
||||
if (loaded_) {
|
||||
LOG_WARNING(kTag, "模型已加载,先卸载再加载");
|
||||
unloadModel();
|
||||
}
|
||||
|
||||
QString errorMsg;
|
||||
bool success = impl_->loadInWorker(modelPath, tokensPath, device, numThreads, errorMsg);
|
||||
loaded_ = success;
|
||||
|
||||
if (success) {
|
||||
emit modelLoaded(modelPath);
|
||||
} else {
|
||||
emit modelLoadError(modelPath, errorMsg);
|
||||
emit error(errorMsg);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
void SenseVoiceEngine::loadModelAsync(const QString& modelPath,
|
||||
const QString& tokensPath,
|
||||
const QString& device,
|
||||
int numThreads)
|
||||
{
|
||||
if (loaded_) {
|
||||
LOG_WARNING(kTag, "模型已加载,先卸载再加载");
|
||||
unloadModel();
|
||||
}
|
||||
|
||||
LOG_INFO(kTag, QString("异步加载 SenseVoice 模型: %1").arg(modelPath));
|
||||
|
||||
QFuture<void> future = QtConcurrent::run([this, modelPath, tokensPath, device, numThreads]() {
|
||||
QString errorMsg;
|
||||
bool success = impl_->loadInWorker(modelPath, tokensPath, device, numThreads, errorMsg);
|
||||
|
||||
QMetaObject::invokeMethod(this, [this, modelPath, errorMsg, success]() {
|
||||
loaded_ = success;
|
||||
if (success) {
|
||||
emit modelLoaded(modelPath);
|
||||
} else {
|
||||
emit modelLoadError(modelPath, errorMsg);
|
||||
emit error(errorMsg);
|
||||
}
|
||||
}, Qt::QueuedConnection);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * @brief Release the session, tokenizer and feature extractor.
 *
 * Always clears loaded_ and emits modelUnloaded(), even when nothing was
 * loaded.
 *
 * The mutex is held only while the state is torn down and is released
 * before the signal is emitted. A directly connected slot may therefore
 * call back into the engine (for example to load another model) without
 * deadlocking on the non-recursive mutex.
 */
void SenseVoiceEngine::unloadModel() {
#ifdef HAVE_ONNXRUNTIME
    {
        QMutexLocker locker(&impl_->mutex);
        // Session first, then its options, then the environment.
        impl_->session.reset();
        impl_->sessionOptions.reset();
        impl_->env.reset();
        impl_->features.reset();
        impl_->tokenizer = SenseVoiceTokenizer();
        loaded_ = false;
    }
#else
    loaded_ = false;
#endif
    LOG_INFO(kTag, "模型已卸载");
    emit modelUnloaded();
}
|
||||
|
||||
/// Returns the loaded_ flag, which is set from the result of the last load
/// and cleared by unloadModel().
bool SenseVoiceEngine::isLoaded() const {
|
||||
return loaded_;
|
||||
}
|
||||
|
||||
/**
 * CTC collapse: merge runs of identical tokens and drop blanks.
 *
 * A blank resets the "previous token" state, so the same token appearing on
 * both sides of a blank is emitted twice.
 *
 * @param tokens     Per-frame token ids, in time order.
 * @param blankToken Id of the CTC blank symbol.
 * @return The collapsed token sequence.
 */
static std::vector<int> ctcGreedyDecode(const std::vector<int>& tokens, int blankToken) {
    constexpr int kNoPrevious = -1;

    std::vector<int> collapsed;
    int last = kNoPrevious;

    for (std::size_t i = 0; i < tokens.size(); ++i) {
        const int current = tokens[i];
        const bool isBlank = (current == blankToken);

        if (!isBlank && current != last) {
            collapsed.push_back(current);
        }
        last = isBlank ? kNoPrevious : current;
    }

    return collapsed;
}
|
||||
|
||||
/**
 * Index of the largest value in data[start, end).
 *
 * Ties resolve to the lowest index. When the range holds at most one
 * element, @p start is returned.
 *
 * @param data  Base pointer of the array.
 * @param start First index to examine (inclusive).
 * @param end   One past the last index to examine.
 * @return Absolute index into @p data, not an offset from @p start.
 */
static int argmax(const float* data, int start, int end) {
    if (start >= end) {
        return start;
    }
    const float* const first = data + start;
    const float* const last = data + end;
    // std::max_element keeps the first of several equal maxima.
    return static_cast<int>(std::max_element(first, last) - data);
}
|
||||
|
||||
/**
 * @brief Transcribe one buffer of PCM audio with the loaded model.
 *
 * Pipeline: resample to 16 kHz -> LFR Fbank features -> ONNX session ->
 * per-frame argmax -> CTC collapse -> tokenizer decode.
 *
 * @param samples    PCM samples as floats (the header documents them as
 *                   normalised to [-1, 1]).
 * @param sampleRate Sample rate of @p samples in Hz; any rate other than
 *                   16000 is resampled first.
 * @param language   "zh", "en", "ja", "ko", "yue" or "auto"; empty selects
 *                   automatic detection.
 * @return A result whose text is the transcript, or a "[错误] ..." message
 *         when inference could not be carried out. latency_ms is always set.
 */
RecognitionResult SenseVoiceEngine::infer(const std::vector<float>& samples,
                                          int sampleRate,
                                          const QString& language)
{
    Timer timer;
    RecognitionResult result;

    const QString lang = language.isEmpty() ? QString("auto") : language;
    LOG_DEBUG(kTag, QString("推理语言: %1 (采样率: %2Hz, 样本数: %3)")
                        .arg(lang).arg(sampleRate).arg(samples.size()));

#ifdef HAVE_ONNXRUNTIME
    // Early exit helper: stores the message and the elapsed time.
    auto finishWith = [&result, &timer](const QString& text) {
        result.text = text;
        result.latency_ms = timer.elapsedMs();
        return result;
    };

    if (!loaded_) {
        return finishWith(QString("[错误] 模型未加载"));
    }
    if (samples.empty()) {
        return finishWith(QString(""));
    }
    if (sampleRate <= 0) {
        return finishWith(QString("[错误] 无效的采样率"));
    }

    try {
        constexpr int kTargetSampleRate = 16000;

        // 1. Resample to 16 kHz. This touches no shared state, so it runs
        //    before the mutex is taken.
        Timer preprocessTimer;
        std::vector<float> processedSamples;
        if (sampleRate != kTargetSampleRate) {
            AudioProcessor processor(kTargetSampleRate);
            processedSamples = processor.resample(samples, sampleRate);
            LOG_DEBUG(kTag, QString("重采样: %1Hz -> %2Hz (%3 -> %4 样本)")
                                .arg(sampleRate).arg(kTargetSampleRate)
                                .arg(samples.size()).arg(processedSamples.size()));
        } else {
            processedSamples = samples;
        }

        // Everything below reads impl_ members that unloadModel() resets
        // under this mutex, so take it before the first dereference and
        // re-check that the model is still there.
        QMutexLocker locker(&impl_->mutex);
        if (!impl_->session || !impl_->features) {
            return finishWith(QString("[错误] 模型未加载"));
        }

        // 2. Extract LFR Fbank features.
        std::vector<float> lfrFeatures = impl_->features->extract(processedSamples);
        const int numFrames = static_cast<int>(lfrFeatures.size()) / kLFROutputDim;
        LOG_DEBUG(kTag, QString("特征提取: %1 ms (%2 帧, %3-dim)")
                            .arg(preprocessTimer.elapsedMs(), 0, 'f', 1)
                            .arg(numFrames).arg(kLFROutputDim));

        if (numFrames <= 0) {
            return finishWith(QString("[错误] 特征提取失败"));
        }

        // 3. Build the input tensors: x, x_length, language, text_norm.
        // NOTE(review): the three scalar inputs are sent as int64 here;
        // they must match the element type the model declares (some
        // SenseVoice exports declare int32) - confirm against the model.
        auto memInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

        int64_t xShape[] = {1, numFrames, kLFROutputDim};
        // Pass exactly numFrames * dim elements even if extract() returned
        // a trailing partial frame.
        const size_t featureCount =
            static_cast<size_t>(numFrames) * static_cast<size_t>(kLFROutputDim);

        int64_t scalarShape[] = {1};
        int64_t xLengthVal = numFrames;
        int64_t langVal = languageToInt(lang);
        int64_t textNormVal = kTextNormWithITN;

        std::vector<Ort::Value> inputTensors;
        inputTensors.reserve(4);
        inputTensors.push_back(Ort::Value::CreateTensor<float>(
            memInfo, lfrFeatures.data(), featureCount, xShape, 3));
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
            memInfo, &xLengthVal, 1, scalarShape, 1));
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
            memInfo, &langVal, 1, scalarShape, 1));
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
            memInfo, &textNormVal, 1, scalarShape, 1));

        // 4. Run the session.
        Timer inferTimer;
        std::vector<const char*> inputNamePtrs;
        inputNamePtrs.reserve(impl_->inputNames.size());
        for (const auto& name : impl_->inputNames) inputNamePtrs.push_back(name.c_str());
        std::vector<const char*> outputNamePtrs;
        outputNamePtrs.reserve(impl_->outputNames.size());
        for (const auto& name : impl_->outputNames) outputNamePtrs.push_back(name.c_str());

        // Run() reads inputTensors.size() names from inputNamePtrs, so a
        // model declaring a different number of inputs must be rejected.
        if (inputNamePtrs.size() != inputTensors.size()) {
            return finishWith(QString("[错误] 模型输入数量不匹配: 模型声明 %1 个, 引擎提供 %2 个")
                                  .arg(inputNamePtrs.size()).arg(inputTensors.size()));
        }

        auto outputTensors = impl_->session->Run(
            Ort::RunOptions{nullptr},
            inputNamePtrs.data(), inputTensors.data(), inputTensors.size(),
            outputNamePtrs.data(), outputNamePtrs.size());

        LOG_DEBUG(kTag, QString("ONNX 推理: %1 ms").arg(inferTimer.elapsedMs(), 0, 'f', 1));

        // 5. Read the logits; the layout is expected to be
        //    [batch, seq_len, vocab_size].
        if (outputTensors.empty()) {
            return finishWith(QString("[错误] 模型没有输出"));
        }
        auto& outputTensor = outputTensors[0];
        const auto shape = outputTensor.GetTensorTypeAndShapeInfo().GetShape();
        if (shape.size() != 3 || shape[1] < 0 || shape[2] <= 0) {
            return finishWith(QString("[错误] 模型输出维度异常"));
        }
        const float* logitsData = outputTensor.GetTensorData<float>();

        LOG_DEBUG(kTag, QString("输出维度: [%1, %2, %3]")
                            .arg(shape[0]).arg(shape[1]).arg(shape[2]));

        const size_t seqLen = static_cast<size_t>(shape[1]);
        const int vocabSize = static_cast<int>(shape[2]);

        // 6. CTC greedy decoding. Blank frames must stay in the sequence
        //    handed to ctcGreedyDecode(): they separate genuine repeats of
        //    the same token, which would otherwise be merged into one.
        std::vector<int> frameTokens;
        frameTokens.reserve(seqLen);
        float totalConf = 0.0f;
        int confCount = 0;

        for (size_t t = 0; t < seqLen; t++) {
            // Offset in size_t so long recordings cannot overflow int.
            const float* frameLogits = logitsData + t * static_cast<size_t>(vocabSize);
            const int bestToken = argmax(frameLogits, 0, vocabSize);
            frameTokens.push_back(bestToken);

            if (bestToken != SenseVoiceTokenizer::kTokenBlank) {
                totalConf += frameLogits[bestToken];
                confCount++;
            }
        }

        std::vector<int> decodedTokens =
            ctcGreedyDecode(frameTokens, SenseVoiceTokenizer::kTokenBlank);

        // Rough confidence: sigmoid of the mean winning logit over the
        // non-blank frames. It is squashed into (0, 1) but is not a
        // calibrated probability.
        if (confCount > 0) {
            const float avgLogit = totalConf / confCount;
            result.confidence = 1.0f / (1.0f + std::exp(-avgLogit));
        }

        // 7. Turn token ids into text.
        if (decodedTokens.empty()) {
            result.text = "";
        } else if (impl_->tokenizer.isLoaded()) {
            result.text = impl_->tokenizer.decode(decodedTokens);
            LOG_DEBUG(kTag, QString("解码文本: %1 个 token → %2 字符")
                                .arg(decodedTokens.size()).arg(result.text.length()));
        } else {
            // Fallback: emit the raw token ids separated by spaces.
            QString decodedText;
            for (int token : decodedTokens) {
                if (!decodedText.isEmpty()) decodedText += " ";
                decodedText += QString::number(token);
            }
            result.text = decodedText;
            LOG_WARNING(kTag, "Tokenizer 未加载,使用 token ID 输出");
        }

        result.isFinal = true;

    } catch (const std::exception& e) {
        result.text = QString("[错误] 推理失败: %1").arg(e.what());
        LOG_ERROR(kTag, result.text);
    }
#else
    result.text = "[占位] ONNX Runtime 未启用";
#endif

    result.latency_ms = timer.elapsedMs();
    LOG_DEBUG(kTag, QString("推理总耗时: %1 ms").arg(result.latency_ms, 0, 'f', 1));
    return result;
}
|
||||
|
||||
} // namespace impress
|
||||
63
src/core/sense_voice_engine.h
Normal file
63
src/core/sense_voice_engine.h
Normal file
@ -0,0 +1,63 @@
|
||||
#pragma once
|
||||
|
||||
#include <QObject>
|
||||
#include <QString>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include "stt_engine.h" // RecognitionResult 定义
|
||||
|
||||
namespace impress {
|
||||
|
||||
/**
 * @brief SenseVoice speech-to-text inference engine.
 *
 * Wraps the ONNX Runtime session for a SenseVoice model.
 * Full pipeline: PCM -> Fbank -> LFR -> CMVN -> ONNX -> CTC decoding -> text.
 */
|
||||
class SenseVoiceEngine : public QObject {
|
||||
Q_OBJECT
|
||||
public:
|
||||
explicit SenseVoiceEngine(QObject* parent = nullptr);
|
||||
~SenseVoiceEngine() override;
|
||||
|
||||
/**
 * @brief Load a model on the calling thread.
 *
 * A model that is already loaded is unloaded first. Emits modelLoaded()
 * on success, or modelLoadError() followed by error() on failure.
 *
 * @param modelPath  Path of the ONNX model file.
 * @param tokensPath Path of the vocabulary file; when empty, "tokens.txt"
 *                   in the model's directory is used.
 * @param device     "gpu" is accepted but currently falls back to CPU.
 * @param numThreads Intra-op thread count for ONNX Runtime.
 * @return true when the model was loaded.
 */
|
||||
bool loadModelSync(const QString& modelPath,
|
||||
const QString& tokensPath = QString(),
|
||||
const QString& device = "cpu",
|
||||
int numThreads = 4);
|
||||
|
||||
/**
 * @brief Load a model on a background thread without blocking the UI.
 *
 * Takes the same parameters as loadModelSync(). The outcome is reported
 * through modelLoaded(), or modelLoadError() followed by error(), via a
 * queued call on this object.
 */
|
||||
void loadModelAsync(const QString& modelPath,
|
||||
const QString& tokensPath = QString(),
|
||||
const QString& device = "cpu",
|
||||
int numThreads = 4);
|
||||
|
||||
/** @brief Release the model and emit modelUnloaded(). */
|
||||
void unloadModel();
|
||||
|
||||
/** @brief Whether a model is currently loaded. */
|
||||
bool isLoaded() const;
|
||||
|
||||
/**
 * @brief Run inference on audio data.
 * @param samples    Normalised PCM float samples (range [-1, 1]).
 * @param sampleRate Sample rate in Hz; input that is not 16000 Hz is
 *                   resampled to 16000 Hz.
 * @param language   Language code ("zh", "en", "ja", "ko", "yue", "auto");
 *                   empty selects automatic detection.
 * @return Recognition result; on failure its text carries an error message
 *         instead of a transcript.
 */
|
||||
RecognitionResult infer(const std::vector<float>& samples,
|
||||
int sampleRate,
|
||||
const QString& language = QString());
|
||||
|
||||
signals:
|
||||
// Emitted after a model has been loaded successfully.
void modelLoaded(const QString& modelPath);
|
||||
// Emitted when loading fails; error holds the failure description.
void modelLoadError(const QString& modelPath, const QString& error);
|
||||
// Emitted by unloadModel() once the model state has been released.
void modelUnloaded();
|
||||
// Emitted alongside modelLoadError() with the same description.
void error(const QString& message);
|
||||
|
||||
private:
|
||||
// Opaque implementation state, defined in sense_voice_engine.cpp.
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
// Result of the most recent load; cleared by unloadModel().
bool loaded_ = false;
|
||||
};
|
||||
|
||||
} // namespace impress
|
||||
215
src/core/sense_voice_features.cpp
Normal file
215
src/core/sense_voice_features.cpp
Normal file
@ -0,0 +1,215 @@
|
||||
#include "sense_voice_features.h"
|
||||
#include "sense_voice_cmvn.h"
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <complex>
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846
|
||||
#endif
|
||||
|
||||
namespace impress {
|
||||
|
||||
/**
 * @brief Minimal single-precision complex number used by the FFT in this file.
 *
 * The constructor is intentionally non-explicit so values can be written as
 * brace lists, e.g. `{re, im}`.
 */
struct Complex {
    float re;
    float im;

    Complex(float r = 0, float i = 0) : re(r), im(i) {}

    /** @brief Component-wise sum. */
    Complex operator+(const Complex& rhs) const {
        return Complex(re + rhs.re, im + rhs.im);
    }

    /** @brief Component-wise difference. */
    Complex operator-(const Complex& rhs) const {
        return Complex(re - rhs.re, im - rhs.im);
    }

    /** @brief Complex product. */
    Complex operator*(const Complex& rhs) const {
        return Complex(re * rhs.re - im * rhs.im,
                       re * rhs.im + im * rhs.re);
    }

    /** @brief Scale both components by a real factor. */
    Complex operator*(float scale) const {
        return Complex(re * scale, im * scale);
    }

    /** @brief Squared magnitude |z|^2 (no square root taken). */
    float magnitudeSq() const {
        return re * re + im * im;
    }
};
|
||||
|
||||
static void fft(std::vector<Complex>& x) {
|
||||
int n = static_cast<int>(x.size());
|
||||
if (n <= 1) return;
|
||||
|
||||
for (int i = 1, j = 0; i < n; i++) {
|
||||
int bit = n >> 1;
|
||||
for (; j & bit; bit >>= 1) j ^= bit;
|
||||
j ^= bit;
|
||||
if (i < j) std::swap(x[i], x[j]);
|
||||
}
|
||||
|
||||
for (int len = 2; len <= n; len *= 2) {
|
||||
float angle = -2.0f * static_cast<float>(M_PI) / len;
|
||||
Complex wlen(std::cos(angle), std::sin(angle));
|
||||
for (int i = 0; i < n; i += len) {
|
||||
Complex w(1.0f, 0.0f);
|
||||
for (int j = 0; j < len / 2; j++) {
|
||||
Complex u = x[i + j];
|
||||
Complex v = x[i + j + len / 2] * w;
|
||||
x[i + j] = u + v;
|
||||
x[i + j + len / 2] = u - v;
|
||||
w = w * wlen;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static float hzToMel(float hz) {
|
||||
return 1125.0f * std::log(1.0f + hz / 700.0f);
|
||||
}
|
||||
|
||||
/**
 * @brief Convert a Mel-scale value back to a frequency in Hz.
 *
 * Inverse of hzToMel(): hz = 700 * (exp(mel / 1125) - 1).
 */
static float melToHz(float mel) {
    const float growth = std::exp(mel / 1125.0f);
    return 700.0f * (growth - 1.0f);
}
|
||||
|
||||
/**
 * @brief Construct the extractor and precompute the Mel filter bank.
 * @param sampleRate Sample rate of the audio that will be passed to extract().
 */
SenseVoiceFeatures::SenseVoiceFeatures(int sampleRate)
    : sampleRate_(sampleRate) {
    // The filter bank depends on sampleRate_, so it is built once here.
    buildMelFilters();
}
|
||||
|
||||
/**
 * @brief Build a symmetric Hann window of winLength_ samples.
 *
 * w[n] = 0.5 * (1 - cos(2*pi*n / (winLength_ - 1))).
 *
 * NOTE(review): Kaldi-style fbank front ends commonly use a Hamming window;
 * confirm which window the deployed SenseVoice model was trained with.
 *
 * @return The window coefficients.
 */
std::vector<float> SenseVoiceFeatures::hannWindow() const {
    std::vector<float> coeffs(winLength_);
    for (int n = 0; n < winLength_; ++n) {
        const float phase = 2.0f * static_cast<float>(M_PI) * n / (winLength_ - 1);
        coeffs[n] = 0.5f * (1.0f - std::cos(phase));
    }
    return coeffs;
}
|
||||
|
||||
void SenseVoiceFeatures::buildMelFilters() {
|
||||
int nFreq = nFft_ / 2 + 1;
|
||||
float fMin = 20.0f;
|
||||
float fMax = static_cast<float>(sampleRate_) / 2.0f;
|
||||
float melMin = hzToMel(fMin);
|
||||
float melMax = hzToMel(fMax);
|
||||
|
||||
std::vector<float> melPoints(nMel_ + 2);
|
||||
for (int i = 0; i < nMel_ + 2; i++) {
|
||||
melPoints[i] = melToHz(melMin + (melMax - melMin) * i / (nMel_ + 1));
|
||||
}
|
||||
|
||||
std::vector<int> binPoints(nMel_ + 2);
|
||||
for (int i = 0; i < nMel_ + 2; i++) {
|
||||
binPoints[i] = static_cast<int>(std::round((nFft_ + 1) * melPoints[i] / sampleRate_));
|
||||
binPoints[i] = std::max(0, std::min(nFreq - 1, binPoints[i]));
|
||||
}
|
||||
|
||||
melFilters_.resize(nMel_);
|
||||
for (int m = 0; m < nMel_; m++) {
|
||||
MelFilter filter;
|
||||
filter.startBin = binPoints[m];
|
||||
filter.endBin = binPoints[m + 2] + 1;
|
||||
|
||||
int numWeights = filter.endBin - filter.startBin;
|
||||
filter.weights.resize(numWeights, 0.0f);
|
||||
|
||||
for (int k = 0; k < numWeights; k++) {
|
||||
int bin = filter.startBin + k;
|
||||
if (bin >= binPoints[m] && bin <= binPoints[m + 1]) {
|
||||
int denom = binPoints[m + 1] - binPoints[m];
|
||||
filter.weights[k] = (denom > 0) ? static_cast<float>(bin - binPoints[m]) / denom : 0.0f;
|
||||
} else if (bin > binPoints[m + 1] && bin <= binPoints[m + 2]) {
|
||||
int denom = binPoints[m + 2] - binPoints[m + 1];
|
||||
filter.weights[k] = (denom > 0) ? static_cast<float>(binPoints[m + 2] - bin) / denom : 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
melFilters_[m] = filter;
|
||||
}
|
||||
}
|
||||
|
||||
int SenseVoiceFeatures::nFrames(int numSamples) const {
|
||||
if (numSamples < winLength_) return 0;
|
||||
return (numSamples - winLength_) / hopLength_ + 1;
|
||||
}
|
||||
|
||||
std::vector<float> SenseVoiceFeatures::extract(const std::vector<float>& samples) const {
|
||||
if (samples.empty()) return {};
|
||||
|
||||
int numSamples = static_cast<int>(samples.size());
|
||||
|
||||
// 1. 预加重
|
||||
std::vector<float> emphasized(numSamples);
|
||||
emphasized[0] = samples[0];
|
||||
for (int i = 1; i < numSamples; i++) {
|
||||
emphasized[i] = samples[i] - preEmphasisCoeff_ * samples[i - 1];
|
||||
}
|
||||
|
||||
// 2. 分帧 + FFT + Mel + 对数压缩
|
||||
int numFrames = nFrames(numSamples);
|
||||
if (numFrames <= 0) return {};
|
||||
|
||||
auto window = hannWindow();
|
||||
int nFreq = nFft_ / 2 + 1;
|
||||
|
||||
std::vector<float> fbankData(numFrames * nMel_);
|
||||
|
||||
for (int f = 0; f < numFrames; f++) {
|
||||
int frameStart = f * hopLength_;
|
||||
|
||||
// 应用 Hann 窗并 FFT
|
||||
std::vector<Complex> fftInput(nFft_, {0.0f, 0.0f});
|
||||
for (int i = 0; i < winLength_ && frameStart + i < numSamples; i++) {
|
||||
fftInput[i] = {emphasized[frameStart + i] * window[i], 0.0f};
|
||||
}
|
||||
fft(fftInput);
|
||||
|
||||
// Mel 滤波器组
|
||||
for (int m = 0; m < nMel_; m++) {
|
||||
const auto& filter = melFilters_[m];
|
||||
float energy = 0.0f;
|
||||
for (int w = 0; w < static_cast<int>(filter.weights.size()); w++) {
|
||||
int bin = filter.startBin + w;
|
||||
if (bin < nFreq) {
|
||||
energy += fftInput[bin].magnitudeSq() * filter.weights[w];
|
||||
}
|
||||
}
|
||||
// 对数压缩 (使用自然对数)
|
||||
energy = std::max(energy, 1e-10f);
|
||||
fbankData[f * nMel_ + m] = std::log(energy);
|
||||
}
|
||||
}
|
||||
|
||||
// 3. LFR (Low Frame Rate) 特征拼接
|
||||
// 将连续 lfr_window_size 帧 Fbank 特征拼接为一帧
|
||||
// 步长为 lfr_window_shift
|
||||
std::vector<float> lfrFeatures;
|
||||
int lfrOutputDim = nMel_ * kLFRWindowSize; // 80 * 7 = 560
|
||||
|
||||
for (int i = 0; ; i += kLFRWindowShift) {
|
||||
if (i >= numFrames) break;
|
||||
|
||||
// 计算 LFR 窗口
|
||||
int leftPad = std::max(0, kLFRWindowSize / 2 - i);
|
||||
int rightPad = std::max(0, kLFRWindowSize / 2 - (numFrames - 1 - i));
|
||||
|
||||
std::vector<float> frame(lfrOutputDim, 0.0f);
|
||||
int outIdx = 0;
|
||||
|
||||
for (int j = -kLFRWindowSize / 2; j < kLFRWindowSize - kLFRWindowSize / 2; j++) {
|
||||
int idx = i + j;
|
||||
// 边界填充:复制第一帧或最后一帧
|
||||
if (idx < 0) idx = 0;
|
||||
if (idx >= numFrames) idx = numFrames - 1;
|
||||
|
||||
for (int m = 0; m < nMel_; m++) {
|
||||
frame[outIdx++] = fbankData[idx * nMel_ + m];
|
||||
}
|
||||
}
|
||||
|
||||
lfrFeatures.insert(lfrFeatures.end(), frame.begin(), frame.end());
|
||||
}
|
||||
|
||||
// 4. CMVN 归一化
|
||||
cmvn(lfrFeatures);
|
||||
|
||||
return lfrFeatures;
|
||||
}
|
||||
|
||||
void SenseVoiceFeatures::cmvn(std::vector<float>& features) const {
|
||||
int nLFRFrames = static_cast<int>(features.size()) / kLFROutputDim;
|
||||
int numValues = static_cast<int>(features.size());
|
||||
|
||||
for (int i = 0; i < numValues; i++) {
|
||||
features[i] = (features[i] + kNegMean[i % kLFROutputDim]) *
|
||||
kInvStddev[i % kLFROutputDim];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace impress
|
||||
54
src/core/sense_voice_features.h
Normal file
54
src/core/sense_voice_features.h
Normal file
@ -0,0 +1,54 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace impress {
|
||||
|
||||
/**
 * @brief SenseVoice audio feature extractor.
 *
 * Converts raw PCM audio into the LFR Fbank features the SenseVoice model
 * consumes. Pipeline: PCM -> pre-emphasis -> framing -> FFT -> Mel filter
 * bank -> log compression -> LFR stacking -> CMVN -> 560-dim feature vectors.
 */
class SenseVoiceFeatures {
public:
    /**
     * @brief Constructor.
     * @param sampleRate Sample rate of the input audio (default 16000).
     */
    explicit SenseVoiceFeatures(int sampleRate = 16000);

    /**
     * @brief Extract LFR Fbank features from PCM data.
     * @param samples Normalised floating-point PCM data in [-1, 1].
     * @return Flattened LFR Fbank features of size [nFrames * 560].
     */
    std::vector<float> extract(const std::vector<float>& samples) const;

    /** @brief Number of feature frames for @p numSamples input samples. */
    int nFrames(int numSamples) const;

private:
    /** @brief One precomputed Mel filter. */
    struct MelFilter {
        int startBin;
        int endBin;
        std::vector<float> weights;
    };

    std::vector<float> hannWindow() const;
    void buildMelFilters();
    void cmvn(std::vector<float>& features) const;

    // Fbank parameters
    int sampleRate_;
    int nFft_ = 512;
    int nMel_ = 80;
    int hopLength_ = 160;  // 10 ms @ 16 kHz
    int winLength_ = 400;  // 25 ms @ 16 kHz
    float preEmphasisCoeff_ = 0.97f;

    // Mel filter bank (precomputed)
    std::vector<MelFilter> melFilters_;
};
|
||||
|
||||
} // namespace impress
|
||||
101
src/core/sense_voice_tokenizer.cpp
Normal file
101
src/core/sense_voice_tokenizer.cpp
Normal file
@ -0,0 +1,101 @@
|
||||
#include "sense_voice_tokenizer.h"
|
||||
#include "utils/logger.h"
|
||||
#include <QFile>
|
||||
#include <QTextStream>
|
||||
#include <QRegularExpression>
|
||||
|
||||
static const char* const kTag = "SenseVoiceTokenizer";
|
||||
|
||||
namespace impress {
|
||||
|
||||
/** @brief Create an empty tokenizer; call load() before decoding. */
SenseVoiceTokenizer::SenseVoiceTokenizer() = default;
|
||||
|
||||
bool SenseVoiceTokenizer::load(const QString& tokensPath) {
|
||||
QFile file(tokensPath);
|
||||
if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
|
||||
LOG_ERROR(kTag, QString("无法打开词表文件: %1").arg(tokensPath));
|
||||
return false;
|
||||
}
|
||||
|
||||
QTextStream stream(&file);
|
||||
stream.setEncoding(QStringConverter::Utf8);
|
||||
|
||||
tokenToString_.clear();
|
||||
|
||||
int lineCount = 0;
|
||||
while (!stream.atEnd()) {
|
||||
QString line = stream.readLine().trimmed();
|
||||
if (line.isEmpty()) continue;
|
||||
|
||||
// 格式: "<token> <id>" — 最后一个是 token_id
|
||||
int lastSpace = line.lastIndexOf(' ');
|
||||
if (lastSpace < 0) continue;
|
||||
|
||||
bool ok = false;
|
||||
int tokenId = line.mid(lastSpace + 1).toInt(&ok);
|
||||
if (!ok) continue;
|
||||
|
||||
QString tokenStr = line.left(lastSpace);
|
||||
tokenToString_[tokenId] = tokenStr;
|
||||
lineCount++;
|
||||
}
|
||||
|
||||
LOG_INFO(kTag, QString("词表已加载: %1 个词条 (%2)").arg(lineCount).arg(tokensPath));
|
||||
return !tokenToString_.empty();
|
||||
}
|
||||
|
||||
/**
 * @brief Decode a sequence of token ids into text.
 *
 * Special tokens (blank, BOS, EOS) are skipped. Ids missing from the
 * vocabulary are rendered as "[T<id>]". SentencePiece byte-fallback pieces
 * of the exact form "<0xHH>" each carry one raw UTF-8 byte; consecutive byte
 * pieces are collected and decoded together so that multi-byte characters
 * are reconstructed correctly instead of being emitted one byte per
 * character. All other pieces go through decodeBPE().
 *
 * @param tokens Token ids to decode.
 * @return The decoded text; empty if @p tokens is empty.
 */
QString SenseVoiceTokenizer::decode(const std::vector<int>& tokens) const {
    if (tokens.empty()) return "";

    QString result;
    QByteArray pendingBytes;  // raw bytes from consecutive "<0xHH>" pieces

    // Decode the accumulated bytes as UTF-8 and append them to the result.
    const auto flushPendingBytes = [&result, &pendingBytes]() {
        if (!pendingBytes.isEmpty()) {
            result += QString::fromUtf8(pendingBytes);
            pendingBytes.clear();
        }
    };

    // Value of a single hexadecimal digit, or -1 if the character is not one.
    const auto hexDigit = [](QChar ch) -> int {
        const int u = ch.unicode();
        if (u >= '0' && u <= '9') return u - '0';
        if (u >= 'A' && u <= 'F') return u - 'A' + 10;
        if (u >= 'a' && u <= 'f') return u - 'a' + 10;
        return -1;
    };

    for (int token : tokens) {
        // Skip special tokens.
        if (token == kTokenBOS || token == kTokenEOS || token == kTokenBlank) {
            continue;
        }

        const auto it = tokenToString_.find(token);
        if (it == tokenToString_.end()) {
            flushPendingBytes();
            result += QString("[T%1]").arg(token);
            continue;
        }

        const QString& piece = it->second;

        // Byte-fallback piece: exactly "<0x" + two hex digits + ">".
        if (piece.length() == 6 &&
            piece.at(0) == QLatin1Char('<') &&
            piece.at(1) == QLatin1Char('0') &&
            piece.at(2) == QLatin1Char('x') &&
            piece.at(5) == QLatin1Char('>')) {
            const int high = hexDigit(piece.at(3));
            const int low = hexDigit(piece.at(4));
            if (high >= 0 && low >= 0) {
                pendingBytes.append(static_cast<char>(high * 16 + low));
                continue;
            }
        }

        flushPendingBytes();
        result += decodeBPE(piece);
    }

    flushPendingBytes();
    return result;
}
|
||||
|
||||
/**
 * @brief Convert one SentencePiece-style BPE piece into display text.
 *
 * U+2581 (the SentencePiece word-boundary marker) is replaced with a space.
 * Every "<0x..>" escape inside the piece is replaced by the character whose
 * code is the parsed hexadecimal value; an escape that fails to parse is
 * kept verbatim.
 *
 * @param token The vocabulary piece.
 * @return The converted text.
 */
QString SenseVoiceTokenizer::decodeBPE(const QString& token) const {
    QString text = token;
    text.replace(QChar(0x2581), ' ');

    static QRegularExpression hexPattern("<0x([0-9A-Fa-f]+)>");
    QRegularExpressionMatchIterator matches = hexPattern.globalMatch(text);
    if (!matches.hasNext()) {
        return text;  // no escapes: nothing more to do
    }

    // Rebuild the string, copying literal text between escapes as-is.
    QString rebuilt;
    int cursor = 0;
    while (matches.hasNext()) {
        const QRegularExpressionMatch hit = matches.next();
        rebuilt += text.mid(cursor, hit.capturedStart() - cursor);

        bool parsed = false;
        const int code = hit.captured(1).toInt(&parsed, 16);
        if (parsed) {
            rebuilt += QChar(code);
        } else {
            rebuilt += hit.captured(0);
        }
        cursor = static_cast<int>(hit.capturedEnd());
    }
    rebuilt += text.mid(cursor);

    return rebuilt;
}
|
||||
|
||||
} // namespace impress
|
||||
42
src/core/sense_voice_tokenizer.h
Normal file
42
src/core/sense_voice_tokenizer.h
Normal file
@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
|
||||
#include <QString>
|
||||
#include <QStringList>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace impress {
|
||||
|
||||
/**
|
||||
* @brief SenseVoice Tokenizer
|
||||
*
|
||||
* 加载 tokens.txt 词表,支持 BPE token 到文本的解码。
|
||||
* 支持 SenseVoice 的 BPE 词表格式 (SentencePiece)。
|
||||
*/
|
||||
class SenseVoiceTokenizer {
|
||||
public:
|
||||
SenseVoiceTokenizer();
|
||||
|
||||
/** @brief 从 tokens.txt 加载词表 */
|
||||
bool load(const QString& tokensPath);
|
||||
|
||||
/** @brief 将 token IDs 解码为文本 */
|
||||
QString decode(const std::vector<int>& tokens) const;
|
||||
|
||||
/** @brief 是否已加载 */
|
||||
bool isLoaded() const { return !tokenToString_.empty(); }
|
||||
|
||||
/** @brief 词表大小 */
|
||||
int vocabSize() const { return static_cast<int>(tokenToString_.size()); }
|
||||
|
||||
// 特殊 token
|
||||
static constexpr int kTokenBlank = 0; // CTC blank / <unk>
|
||||
static constexpr int kTokenBOS = 1; // <s>
|
||||
static constexpr int kTokenEOS = 2; // </s>
|
||||
|
||||
private:
|
||||
std::unordered_map<int, QString> tokenToString_;
|
||||
QString decodeBPE(const QString& token) const;
|
||||
};
|
||||
|
||||
} // namespace impress
|
||||
@ -1,5 +1,5 @@
|
||||
#include "file_transcribe_page.h"
|
||||
#include "core/stt_engine.h"
|
||||
#include "core/sense_voice_engine.h"
|
||||
#include "audio/audio_decoder.h"
|
||||
#include "app/config_manager.h"
|
||||
#include "utils/logger.h"
|
||||
@ -34,7 +34,7 @@ namespace impress {
|
||||
FileTranscribePage::FileTranscribePage(ConfigManager* configManager, QWidget* parent)
|
||||
: QWidget(parent)
|
||||
, configManager_(configManager)
|
||||
, sttEngine_(new STTEngine(this))
|
||||
, sttEngine_(new SenseVoiceEngine(this))
|
||||
, audioDecoder_(new AudioDecoder(this))
|
||||
{
|
||||
setupUI();
|
||||
@ -161,6 +161,7 @@ void FileTranscribePage::onStartTranscribe() {
|
||||
|
||||
(void)QtConcurrent::run([this, modelPath]() {
|
||||
bool success = sttEngine_->loadModelSync(modelPath,
|
||||
configManager_->get("stt.tokens_path").toString(),
|
||||
configManager_->get("stt.device").toString(),
|
||||
configManager_->get("stt.num_threads").toInt());
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ class QByteArray;
|
||||
namespace impress {
|
||||
|
||||
class ConfigManager;
|
||||
class STTEngine;
|
||||
class SenseVoiceEngine;
|
||||
class AudioDecoder;
|
||||
|
||||
struct TranscribeTask {
|
||||
@ -62,7 +62,7 @@ private:
|
||||
QString formatSRTTime(double seconds) const;
|
||||
|
||||
ConfigManager* configManager_;
|
||||
STTEngine* sttEngine_;
|
||||
SenseVoiceEngine* sttEngine_;
|
||||
AudioDecoder* audioDecoder_;
|
||||
|
||||
// UI 控件
|
||||
|
||||
@ -47,7 +47,7 @@ void SettingsPage::setupUI() {
|
||||
sttLayout->addRow("模型路径:", modelRow);
|
||||
|
||||
modelTypeCombo_ = new QComboBox(this);
|
||||
modelTypeCombo_->addItems({"whisper", "paraformer", "conformer"});
|
||||
modelTypeCombo_->addItems({"sense_voice", "whisper", "paraformer", "conformer"});
|
||||
sttLayout->addRow("模型类型:", modelTypeCombo_);
|
||||
|
||||
deviceCombo_ = new QComboBox(this);
|
||||
@ -59,6 +59,15 @@ void SettingsPage::setupUI() {
|
||||
threadSpin_->setValue(4);
|
||||
sttLayout->addRow("推理线程数:", threadSpin_);
|
||||
|
||||
auto* tokensRow = new QHBoxLayout();
|
||||
tokensPathEdit_ = new QLineEdit(this);
|
||||
tokensPathEdit_->setPlaceholderText("选择 tokens.txt 文件路径...");
|
||||
tokensBrowseBtn_ = new QPushButton("浏览...", this);
|
||||
connect(tokensBrowseBtn_, &QPushButton::clicked, this, &SettingsPage::onBrowseTokensPath);
|
||||
tokensRow->addWidget(tokensPathEdit_);
|
||||
tokensRow->addWidget(tokensBrowseBtn_);
|
||||
sttLayout->addRow("词表路径:", tokensRow);
|
||||
|
||||
sampleRateSpin_ = new QSpinBox(this);
|
||||
sampleRateSpin_->setRange(8000, 192000);
|
||||
sampleRateSpin_->setSingleStep(1000);
|
||||
@ -158,6 +167,7 @@ void SettingsPage::setupUI() {
|
||||
|
||||
void SettingsPage::loadFromConfig() {
|
||||
modelPathEdit_->setText(configManager_->get("stt.model_path").toString());
|
||||
tokensPathEdit_->setText(configManager_->get("stt.tokens_path").toString());
|
||||
modelTypeCombo_->setCurrentText(configManager_->get("stt.model_type").toString());
|
||||
deviceCombo_->setCurrentText(configManager_->get("stt.device").toString());
|
||||
threadSpin_->setValue(configManager_->get("stt.num_threads").toInt());
|
||||
@ -179,6 +189,7 @@ void SettingsPage::loadFromConfig() {
|
||||
|
||||
void SettingsPage::saveToConfig() {
|
||||
configManager_->set("stt.model_path", modelPathEdit_->text());
|
||||
configManager_->set("stt.tokens_path", tokensPathEdit_->text());
|
||||
configManager_->set("stt.model_type", modelTypeCombo_->currentText());
|
||||
configManager_->set("stt.device", deviceCombo_->currentText());
|
||||
configManager_->set("stt.num_threads", threadSpin_->value());
|
||||
@ -206,6 +217,14 @@ void SettingsPage::onBrowseModelPath() {
|
||||
}
|
||||
}
|
||||
|
||||
void SettingsPage::onBrowseTokensPath() {
|
||||
QString path = QFileDialog::getOpenFileName(this, "选择词表文件", "",
|
||||
"词表文件 (tokens.txt);;所有文件 (*.*)");
|
||||
if (!path.isEmpty()) {
|
||||
tokensPathEdit_->setText(path);
|
||||
}
|
||||
}
|
||||
|
||||
void SettingsPage::onSaveConfig() {
|
||||
saveToConfig();
|
||||
if (configManager_->save()) {
|
||||
|
||||
@ -29,6 +29,7 @@ public:
|
||||
|
||||
private slots:
|
||||
void onBrowseModelPath();
|
||||
void onBrowseTokensPath();
|
||||
void onSaveConfig();
|
||||
void onResetConfig();
|
||||
|
||||
@ -42,6 +43,8 @@ private:
|
||||
// STT 设置
|
||||
QLineEdit* modelPathEdit_;
|
||||
QPushButton* browseBtn_;
|
||||
QLineEdit* tokensPathEdit_;
|
||||
QPushButton* tokensBrowseBtn_;
|
||||
QComboBox* modelTypeCombo_;
|
||||
QComboBox* deviceCombo_;
|
||||
QSpinBox* threadSpin_;
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#include "stt_test_page.h"
|
||||
#include "core/stt_engine.h"
|
||||
#include "core/sense_voice_engine.h"
|
||||
#include "audio/audio_capture.h"
|
||||
#include "audio/audio_ring_buffer.h"
|
||||
#include "widgets/audio_waveform.h"
|
||||
@ -26,7 +26,7 @@ namespace impress {
|
||||
STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent)
|
||||
: QWidget(parent)
|
||||
, configManager_(configManager)
|
||||
, sttEngine_(new STTEngine(this))
|
||||
, sttEngine_(new SenseVoiceEngine(this))
|
||||
, audioCapture_(new AudioCapture(this))
|
||||
{
|
||||
setupUI();
|
||||
@ -34,11 +34,11 @@ STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent)
|
||||
// 信号连接
|
||||
connect(audioCapture_, &AudioCapture::audioDataReady,
|
||||
this, &STTTestPage::onAudioDataReady);
|
||||
connect(sttEngine_, &STTEngine::modelLoaded,
|
||||
connect(sttEngine_, &SenseVoiceEngine::modelLoaded,
|
||||
this, &STTTestPage::onModelLoaded);
|
||||
connect(sttEngine_, &STTEngine::modelLoadError,
|
||||
connect(sttEngine_, &SenseVoiceEngine::modelLoadError,
|
||||
this, &STTTestPage::onModelLoadError);
|
||||
connect(sttEngine_, &STTEngine::modelUnloaded,
|
||||
connect(sttEngine_, &SenseVoiceEngine::modelUnloaded,
|
||||
this, &STTTestPage::onModelUnloaded);
|
||||
}
|
||||
|
||||
@ -133,6 +133,7 @@ void STTTestPage::onToggleRecording() {
|
||||
updateUIState();
|
||||
|
||||
sttEngine_->loadModelAsync(modelPath,
|
||||
configManager_->get("stt.tokens_path").toString(),
|
||||
configManager_->get("stt.device").toString(),
|
||||
configManager_->get("stt.num_threads").toInt());
|
||||
|
||||
|
||||
@ -13,13 +13,14 @@ class QSpinBox;
|
||||
namespace impress {
|
||||
|
||||
class ConfigManager;
|
||||
class STTEngine;
|
||||
class SenseVoiceEngine;
|
||||
class AudioCapture;
|
||||
|
||||
/**
|
||||
* @brief STT 测试页面
|
||||
*
|
||||
* 实时麦克风采集 + 流式识别。
|
||||
* 使用 SenseVoice 模型进行推理。
|
||||
* 模型异步加载,不阻塞 UI。
|
||||
*/
|
||||
class STTTestPage : public QWidget {
|
||||
@ -43,7 +44,7 @@ private:
|
||||
void processAudioChunk(const std::vector<float>& samples, int sampleRate);
|
||||
|
||||
ConfigManager* configManager_;
|
||||
STTEngine* sttEngine_;
|
||||
SenseVoiceEngine* sttEngine_;
|
||||
AudioCapture* audioCapture_;
|
||||
|
||||
// UI 控件
|
||||
|
||||
Loading…
Reference in New Issue
Block a user