feat: 集成 SenseVoice 完整推理管线

- 新增 SenseVoiceEngine: 专用 SenseVoice ONNX 模型推理引擎
- 新增 SenseVoiceFeatures: Fbank 特征提取 + LFR 拼接 + CMVN 归一化
  - 80-dim Mel 滤波器组 + 对数压缩
  - LFR (Low Frame Rate): window_size=7, window_shift=6 → 560-dim
  - CMVN: neg_mean / inv_stddev 从模型元数据自动提取
- 新增 SenseVoiceTokenizer: 加载 tokens.txt 词表,BPE 解码
- 新增 CTC 贪婪解码: 去除重复 token 和空白符
- 配置页面新增词表路径选择
- STT 测试页面和文件转写页面切换至 SenseVoiceEngine
- 更新 CMakeLists.txt 包含所有新增源文件
- 模型: /home/alvin/Documents/SenseVoice-Small/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx
- 语言支持: 中文/英语/日语/韩语/粤语 自动检测

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alvin Young 2026-05-12 19:26:11 +08:00
parent 49313f15f9
commit 9a6dfa3b88
16 changed files with 1114 additions and 14 deletions

View File

@ -46,6 +46,9 @@ set(SOURCES
# Core
src/core/stt_engine.cpp
src/core/sense_voice_engine.cpp
src/core/sense_voice_features.cpp
src/core/sense_voice_tokenizer.cpp
src/core/mel_spectrogram.cpp
src/core/whisper_tokenizer.cpp
src/core/audio_processor.cpp
@ -76,6 +79,10 @@ set(HEADERS
src/app/config_manager.h
src/core/stt_engine.h
src/core/sense_voice_engine.h
src/core/sense_voice_features.h
src/core/sense_voice_tokenizer.h
src/core/sense_voice_cmvn.h
src/core/mel_spectrogram.h
src/core/whisper_tokenizer.h
src/core/audio_processor.h

View File

@ -131,7 +131,8 @@ ctest
- [x] 语音活动检测 (VAD — 短时能量 + 过零率)
- [x] 音频文件信息 (时长/采样率/声道数)
- [x] 单元测试框架 (Catch2, 39 个测试用例)
- [ ] 完整 Whisper 推理 (自回归解码 + 流式识别)
- [x] 完整 Whisper 推理 (自回归解码 + 流式识别)
- [x] SenseVoice 完整推理 (Fbank → LFR → CMVN → ONNX → CTC 解码)
- [ ] 跨平台打包
## License

View File

@ -66,7 +66,8 @@ void ConfigManager::loadDefaults() {
config_ = QVariantMap{
{"stt", QVariantMap{
{"model_path", ""},
{"model_type", "whisper"},
{"model_type", "sense_voice"},
{"tokens_path", ""},
{"device", "cpu"},
{"num_threads", 4},
{"sample_rate", 16000},

174
src/core/sense_voice_cmvn.h Normal file
View File

@ -0,0 +1,174 @@
// Auto-generated CMVN coefficients from SenseVoice model
// DO NOT EDIT MANUALLY
#pragma once
namespace impress {
// neg_mean (560 values)
static constexpr float kNegMean[] = {
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
-8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f,
-12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f,
-13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f,
-13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f,
-13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f,
-13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f,
-14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f,
-15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f,
-14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f,
-14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f,
};
// inv_stddev (560 values)
static constexpr float kInvStddev[] = {
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f,
0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f,
0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f,
0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f,
0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f,
0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f,
0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f,
0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f,
0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f,
0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f,
};
// LFR (low frame rate) stacking parameters.
static constexpr int kLFRWindowSize = 7;   // Fbank frames stacked into one LFR frame
static constexpr int kLFRWindowShift = 6;  // step, in Fbank frames, between LFR frames
static constexpr int kFBankDim = 80;       // Fbank (Mel) dimension of one frame
static constexpr int kLFROutputDim = 560; // 80 * 7
// Language codes.
static constexpr int kLangAuto = 0;
static constexpr int kLangZh = 3;
static constexpr int kLangEn = 4;
static constexpr int kLangYue = 7;
static constexpr int kLangJa = 11;
static constexpr int kLangKo = 12;
static constexpr int kLangNoSpeech = 13;
// Text normalization modes (with / without ITN).
static constexpr int kTextNormWithITN = 14;
static constexpr int kTextNormWithoutITN = 15;
} // namespace impress

View File

@ -0,0 +1,417 @@
#include "sense_voice_engine.h"
#include "sense_voice_features.h"
#include "sense_voice_tokenizer.h"
#include "sense_voice_cmvn.h"
#include "audio_processor.h"
#include "utils/logger.h"
#include "utils/timer.h"
#include <QThread>
#include <QFuture>
#include <QtConcurrent>
#include <QMutex>
#include <QMutexLocker>
#include <QFileInfo>
#include <algorithm>
#include <cmath>
// ONNX Runtime headers
#ifdef HAVE_ONNXRUNTIME
#include <onnxruntime_cxx_api.h>
#endif
static const char* const kTag = "SenseVoiceEngine";
namespace impress {
/**
 * Maps a language tag to the integer language code passed to the model.
 *
 * "zh", "en", "ja", "ko" and "yue" map to their dedicated codes; an empty
 * tag, "auto", and any unrecognised tag all map to kLangAuto.
 */
static int languageToInt(const QString& lang) {
    int code = kLangAuto;  // default for "", "auto" and unknown tags
    if (lang == "zh") {
        code = kLangZh;
    } else if (lang == "en") {
        code = kLangEn;
    } else if (lang == "ja") {
        code = kLangJa;
    } else if (lang == "ko") {
        code = kLangKo;
    } else if (lang == "yue") {
        code = kLangYue;
    }
    return code;
}
/**
 * @brief Private implementation state of SenseVoiceEngine.
 *
 * Owns the ONNX Runtime environment/session, the cached model input and
 * output names, the tokenizer and the feature extractor. The struct exposes
 * the same `mutex` member and `loadInWorker()` entry point whether or not
 * HAVE_ONNXRUNTIME is defined, so the engine methods compile in both
 * configurations.
 */
struct SenseVoiceEngine::Impl {
    /// Serialises loading, unloading and inference on the members below.
    QMutex mutex;

#ifdef HAVE_ONNXRUNTIME
    std::unique_ptr<Ort::Env> env;
    std::unique_ptr<Ort::SessionOptions> sessionOptions;
    std::unique_ptr<Ort::Session> session;
    std::vector<std::string> inputNames;
    std::vector<std::string> outputNames;
    SenseVoiceTokenizer tokenizer;
    std::unique_ptr<SenseVoiceFeatures> features;

    /**
     * @brief Creates the ONNX session and loads the tokenizer vocabulary.
     *
     * Holds `mutex` for the whole call. The session objects are built into
     * locals first and only moved into the members once the model has been
     * opened and its input/output names have been read.
     *
     * @param modelPath  Path of the ONNX model file.
     * @param tokensPath Path of the vocabulary; when empty, "tokens.txt" next
     *                   to the model is used.
     * @param device     "gpu" only produces a warning; execution stays on CPU.
     * @param numThreads Intra-op thread count given to ONNX Runtime.
     * @param errorMsg   Receives the failure description when false is returned.
     * @return true on success, false if an exception was thrown while loading.
     */
    bool loadInWorker(const QString& modelPath,
                      const QString& tokensPath,
                      const QString& device,
                      int numThreads,
                      QString& errorMsg)
    {
        QMutexLocker locker(&mutex);
        try {
            auto envPtr = std::make_unique<Ort::Env>(
                ORT_LOGGING_LEVEL_WARNING, "impress_sensevoice");
            auto optionsPtr = std::make_unique<Ort::SessionOptions>();
            optionsPtr->SetIntraOpNumThreads(numThreads);
            optionsPtr->SetGraphOptimizationLevel(
                GraphOptimizationLevel::ORT_ENABLE_ALL);
            if (device == "gpu") {
                LOG_WARNING(kTag, "GPU 加速尚未实现,回退到 CPU");
            }
            LOG_INFO(kTag, QString("正在加载 SenseVoice 模型: %1 (线程: %2)")
                .arg(modelPath).arg(numThreads));

            // ONNX Runtime takes the model path as ORTCHAR_T*, which is
            // wchar_t on Windows and char elsewhere.
#ifdef _WIN32
            const std::wstring nativeModelPath = modelPath.toStdWString();
#else
            const QByteArray nativeModelPath = modelPath.toUtf8();
#endif
            auto sessionPtr = std::make_unique<Ort::Session>(
                *envPtr,
                nativeModelPath.data(),
                *optionsPtr);

            Ort::AllocatorWithDefaultOptions allocator;
            size_t inputCount = sessionPtr->GetInputCount();
            size_t outputCount = sessionPtr->GetOutputCount();
            LOG_INFO(kTag, QString("模型有 %1 个输入, %2 个输出")
                .arg(inputCount).arg(outputCount));

            // Cache the names as std::string: the allocated name pointers
            // are released at the end of each loop iteration.
            inputNames.clear();
            outputNames.clear();
            for (size_t i = 0; i < inputCount; i++) {
                auto namePtr = sessionPtr->GetInputNameAllocated(i, allocator);
                inputNames.emplace_back(namePtr.get());
                LOG_DEBUG(kTag, QString("输入 #%1: %2").arg(i).arg(namePtr.get()));
            }
            for (size_t i = 0; i < outputCount; i++) {
                auto namePtr = sessionPtr->GetOutputNameAllocated(i, allocator);
                outputNames.emplace_back(namePtr.get());
                LOG_DEBUG(kTag, QString("输出 #%1: %2").arg(i).arg(namePtr.get()));
            }

            env = std::move(envPtr);
            sessionOptions = std::move(optionsPtr);
            session = std::move(sessionPtr);

            // Load the tokenizer vocabulary; a missing file is only a warning.
            QString vocabPath = tokensPath;
            if (vocabPath.isEmpty()) {
                QFileInfo modelInfo(modelPath);
                vocabPath = modelInfo.absolutePath() + "/tokens.txt";
            }
            if (QFile::exists(vocabPath)) {
                tokenizer.load(vocabPath);
                LOG_INFO(kTag, QString("Tokenizer 词表已加载: %1").arg(vocabPath));
            } else {
                LOG_WARNING(kTag, QString("未找到 tokenizer 词表: %1").arg(vocabPath));
            }

            // Feature extractor for 16 kHz audio.
            features = std::make_unique<SenseVoiceFeatures>(16000);
            LOG_INFO(kTag, QString("SenseVoice 模型加载成功: %1").arg(modelPath));
            return true;
        } catch (const Ort::Exception& e) {
            errorMsg = QString("ONNX 异常: %1").arg(e.what());
            LOG_ERROR(kTag, errorMsg);
            return false;
        } catch (const std::exception& e) {
            errorMsg = QString("加载异常: %1").arg(e.what());
            LOG_ERROR(kTag, errorMsg);
            return false;
        }
    }
#else
    /**
     * @brief Stand-in used when ONNX Runtime is compiled out.
     *
     * Always fails and reports the reason through @p errorMsg, so the load
     * entry points of the engine keep compiling and report a load error.
     */
    bool loadInWorker(const QString& modelPath,
                      const QString& tokensPath,
                      const QString& device,
                      int numThreads,
                      QString& errorMsg)
    {
        Q_UNUSED(modelPath);
        Q_UNUSED(tokensPath);
        Q_UNUSED(device);
        Q_UNUSED(numThreads);
        errorMsg = QString("ONNX Runtime 未启用,无法加载模型");
        LOG_ERROR(kTag, errorMsg);
        return false;
    }
#endif
};
SenseVoiceEngine::SenseVoiceEngine(QObject* parent)
: QObject(parent)
, impl_(std::make_unique<Impl>())
{}
/** Releases the model through unloadModel() before the engine goes away. */
SenseVoiceEngine::~SenseVoiceEngine()
{
    this->unloadModel();
}
bool SenseVoiceEngine::loadModelSync(const QString& modelPath,
const QString& tokensPath,
const QString& device,
int numThreads)
{
if (loaded_) {
LOG_WARNING(kTag, "模型已加载,先卸载再加载");
unloadModel();
}
QString errorMsg;
bool success = impl_->loadInWorker(modelPath, tokensPath, device, numThreads, errorMsg);
loaded_ = success;
if (success) {
emit modelLoaded(modelPath);
} else {
emit modelLoadError(modelPath, errorMsg);
emit error(errorMsg);
}
return success;
}
/**
 * Loads the model on a QtConcurrent worker thread.
 *
 * An already-loaded model is unloaded first. When the worker finishes, the
 * outcome is posted back to this object with a queued invocation, which
 * updates loaded_ and emits modelLoaded(), or modelLoadError() + error().
 *
 * NOTE(review): the worker lambda captures `this` and the returned future is
 * not kept, so the engine presumably must outlive the load — confirm.
 */
void SenseVoiceEngine::loadModelAsync(const QString& modelPath,
                                      const QString& tokensPath,
                                      const QString& device,
                                      int numThreads)
{
    if (loaded_) {
        LOG_WARNING(kTag, "模型已加载,先卸载再加载");
        unloadModel();
    }
    LOG_INFO(kTag, QString("异步加载 SenseVoice 模型: %1").arg(modelPath));

    QFuture<void> future = QtConcurrent::run([this, modelPath, tokensPath, device, numThreads]() {
        QString failureReason;
        const bool ok = impl_->loadInWorker(modelPath, tokensPath, device, numThreads, failureReason);

        // Publish the outcome from the thread that owns this object.
        QMetaObject::invokeMethod(this, [this, modelPath, failureReason, ok]() {
            loaded_ = ok;
            if (!ok) {
                emit modelLoadError(modelPath, failureReason);
                emit error(failureReason);
                return;
            }
            emit modelLoaded(modelPath);
        }, Qt::QueuedConnection);
    });
}
/**
 * @brief Releases the model and all inference state.
 *
 * Resets the session, session options, environment, feature extractor and
 * tokenizer, clears loaded_, then emits modelUnloaded().
 *
 * The mutex is held only while the state is torn down. It is released
 * before the signal is emitted because QMutex is not recursive: a directly
 * connected slot that calls back into the engine (for example to load
 * another model) would otherwise block forever on the same mutex.
 */
void SenseVoiceEngine::unloadModel() {
    {
#ifdef HAVE_ONNXRUNTIME
        // The mutex and the members it guards are only needed for the
        // ONNX Runtime state, so the lock lives inside the guard as well.
        QMutexLocker locker(&impl_->mutex);
        impl_->session.reset();
        impl_->sessionOptions.reset();
        impl_->env.reset();
        impl_->features.reset();
        impl_->tokenizer = SenseVoiceTokenizer();
#endif
        loaded_ = false;
    }
    LOG_INFO(kTag, "模型已卸载");
    emit modelUnloaded();
}
/** Reports the current value of the loaded_ flag. */
bool SenseVoiceEngine::isLoaded() const
{
    const bool modelPresent = loaded_;
    return modelPresent;
}
/**
 * CTC collapse: drops blank tokens and merges runs of the same token.
 *
 * A blank between two equal tokens resets the run, so both tokens are kept.
 *
 * @param tokens     Per-frame token ids, blanks included.
 * @param blankToken Id of the CTC blank symbol.
 * @return The collapsed token sequence.
 */
static std::vector<int> ctcGreedyDecode(const std::vector<int>& tokens, int blankToken) {
    std::vector<int> collapsed;
    int lastSeen = -1;  // -1 means "no run in progress"
    for (std::size_t pos = 0; pos < tokens.size(); ++pos) {
        const int current = tokens[pos];
        const bool isBlank = (current == blankToken);
        if (!isBlank && current != lastSeen) {
            collapsed.push_back(current);
        }
        lastSeen = isBlank ? -1 : current;
    }
    return collapsed;
}
/**
 * Finds the position of the largest value in data[start, end).
 *
 * The returned index is absolute (an index into @p data, not an offset from
 * @p start). Ties resolve to the first maximum; an empty range returns
 * @p start.
 */
static int argmax(const float* data, int start, int end) {
    int winner = start;
    for (int idx = start + 1; idx < end; ++idx) {
        if (data[idx] > data[winner]) {
            winner = idx;
        }
    }
    return winner;
}
/**
 * @brief Runs the full recognition pipeline on one audio buffer.
 *
 * Steps: resample to 16 kHz when needed, extract LFR features, run the ONNX
 * session with the inputs (x, x_length, language, text_norm), take the
 * per-frame argmax of the logits, CTC-collapse the frame tokens and decode
 * them with the tokenizer (falling back to space-separated token ids when no
 * vocabulary is loaded).
 *
 * Failures are reported in-band: result.text starts with "[错误]" and
 * isFinal stays at its default.
 *
 * @param samples    Audio samples.
 * @param sampleRate Sample rate of @p samples in Hz.
 * @param language   "zh", "en", "ja", "ko", "yue" or "auto"; empty means "auto".
 * @return Recognised text, confidence heuristic and total latency in ms.
 */
RecognitionResult SenseVoiceEngine::infer(const std::vector<float>& samples,
                                          int sampleRate,
                                          const QString& language)
{
    Timer timer;
    RecognitionResult result;
    QString lang = language.isEmpty() ? "auto" : language;
    LOG_DEBUG(kTag, QString("推理语言: %1 (采样率: %2Hz, 样本数: %3)")
        .arg(lang).arg(sampleRate).arg(samples.size()));
#ifdef HAVE_ONNXRUNTIME
    // Early exit: stores the text, stamps the latency and hands back a copy.
    const auto finishEarly = [&result, &timer](const QString& text) {
        result.text = text;
        result.latency_ms = timer.elapsedMs();
        return result;
    };

    if (!loaded_) {
        return finishEarly(QString("[错误] 模型未加载"));
    }
    if (samples.empty()) {
        return finishEarly(QString(""));
    }
    try {
        // 1. Resample to 16 kHz.
        Timer preprocessTimer;
        std::vector<float> processedSamples = samples;
        int currentSampleRate = sampleRate;
        if (sampleRate != 16000) {
            AudioProcessor processor(16000);
            processedSamples = processor.resample(samples, sampleRate);
            currentSampleRate = 16000;
            LOG_DEBUG(kTag, QString("重采样: %1Hz -> %2Hz (%3 -> %4 样本)")
                .arg(sampleRate).arg(currentSampleRate)
                .arg(samples.size()).arg(processedSamples.size()));
        }

        // Everything below touches state that unloadModel() resets under the
        // same mutex, so take it before the first access and re-check that
        // the state is still there.
        QMutexLocker locker(&impl_->mutex);
        if (!impl_->session || !impl_->features) {
            return finishEarly(QString("[错误] 模型未加载"));
        }

        // 2. Extract LFR Fbank features.
        std::vector<float> lfrFeatures = impl_->features->extract(processedSamples);
        int numFrames = static_cast<int>(lfrFeatures.size()) / kLFROutputDim;
        LOG_DEBUG(kTag, QString("特征提取: %1 ms (%2 帧, %3-dim)")
            .arg(preprocessTimer.elapsedMs(), 0, 'f', 1)
            .arg(numFrames).arg(kLFROutputDim));
        if (numFrames <= 0) {
            return finishEarly(QString("[错误] 特征提取失败"));
        }

        // 3. Build the input tensors: x, x_length, language, text_norm.
        // The tensors only reference these buffers, which must therefore
        // stay alive until Run() returns.
        int64_t xShape[] = {1, numFrames, kLFROutputDim};
        auto memInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
        int64_t xLengthVal = numFrames;
        int64_t xLengthShape[] = {1};
        int64_t langVal = languageToInt(lang);
        int64_t langShape[] = {1};
        int64_t textNormVal = kTextNormWithITN;
        int64_t textNormShape[] = {1};
        std::vector<Ort::Value> inputTensors;
        inputTensors.push_back(Ort::Value::CreateTensor<float>(
            memInfo, lfrFeatures.data(), lfrFeatures.size(), xShape, 3));
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
            memInfo, &xLengthVal, 1, xLengthShape, 1));
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
            memInfo, &langVal, 1, langShape, 1));
        inputTensors.push_back(Ort::Value::CreateTensor<int64_t>(
            memInfo, &textNormVal, 1, textNormShape, 1));

        // Run() reads one name per tensor; a model with a different number
        // of inputs would make it read past the end of the name array.
        if (impl_->inputNames.size() != inputTensors.size()) {
            return finishEarly(QString("[错误] 模型输入数量不匹配: 期望 %1, 实际 %2")
                .arg(inputTensors.size()).arg(impl_->inputNames.size()));
        }

        // 4. Run the session.
        Timer inferTimer;
        std::vector<const char*> inputNamePtrs;
        for (auto& name : impl_->inputNames) inputNamePtrs.push_back(name.c_str());
        std::vector<const char*> outputNamePtrs;
        for (auto& name : impl_->outputNames) outputNamePtrs.push_back(name.c_str());
        auto outputTensors = impl_->session->Run(
            Ort::RunOptions{nullptr},
            inputNamePtrs.data(), inputTensors.data(), inputTensors.size(),
            outputNamePtrs.data(), outputNamePtrs.size());
        LOG_DEBUG(kTag, QString("ONNX 推理: %1 ms").arg(inferTimer.elapsedMs(), 0, 'f', 1));

        // 5. Read the logits, expected as [1, seq_len, vocab_size].
        if (outputTensors.empty()) {
            return finishEarly(QString("[错误] 模型输出格式异常"));
        }
        auto& outputTensor = outputTensors[0];
        auto shape = outputTensor.GetTensorTypeAndShapeInfo().GetShape();
        if (shape.size() != 3 || shape[1] < 0 || shape[2] <= 0) {
            return finishEarly(QString("[错误] 模型输出格式异常"));
        }
        const float* logitsData = outputTensor.GetTensorData<float>();
        LOG_DEBUG(kTag, QString("输出维度: [%1, %2, %3]")
            .arg(shape[0]).arg(shape[1]).arg(shape[2]));
        int seqLen = static_cast<int>(shape[1]);
        int vocabSize = static_cast<int>(shape[2]);

        // 6. Greedy CTC decoding.
        // Blanks are kept in the frame sequence: the collapse step needs
        // them to tell "a a" (one token) from "a <blank> a" (two tokens).
        std::vector<int> frameTokens;
        frameTokens.reserve(static_cast<std::size_t>(seqLen));
        float totalConf = 0.0f;
        int confCount = 0;
        for (int t = 0; t < seqLen; t++) {
            // Size_t arithmetic: t * vocabSize can exceed INT_MAX on long audio.
            const float* frameLogits =
                logitsData + static_cast<std::size_t>(t) * static_cast<std::size_t>(vocabSize);
            // Searching from the frame's own base yields the token id
            // directly, in [0, vocabSize).
            const int bestToken = argmax(frameLogits, 0, vocabSize);
            frameTokens.push_back(bestToken);
            if (bestToken != SenseVoiceTokenizer::kTokenBlank) {
                totalConf += frameLogits[bestToken];
                confCount++;
            }
        }
        std::vector<int> decodedTokens =
            ctcGreedyDecode(frameTokens, SenseVoiceTokenizer::kTokenBlank);

        // Confidence heuristic: sigmoid of the mean winning logit over the
        // non-blank frames. This squashes into (0, 1) but is not a softmax
        // probability.
        if (confCount > 0) {
            float avgLogit = totalConf / confCount;
            result.confidence = 1.0f / (1.0f + std::exp(-avgLogit));
        }

        // 7. Turn the tokens into text.
        if (decodedTokens.empty()) {
            result.text = "";
        } else if (impl_->tokenizer.isLoaded()) {
            result.text = impl_->tokenizer.decode(decodedTokens);
            LOG_DEBUG(kTag, QString("解码文本: %1 个 token → %2 字符")
                .arg(decodedTokens.size()).arg(result.text.length()));
        } else {
            // Fallback: emit the raw token ids.
            QString decodedText;
            for (int token : decodedTokens) {
                if (!decodedText.isEmpty()) decodedText += " ";
                decodedText += QString::number(token);
            }
            result.text = decodedText;
            LOG_WARNING(kTag, "Tokenizer 未加载,使用 token ID 输出");
        }
        result.isFinal = true;
    } catch (const std::exception& e) {
        result.text = QString("[错误] 推理失败: %1").arg(e.what());
        LOG_ERROR(kTag, result.text);
    }
#else
    result.text = "[占位] ONNX Runtime 未启用";
#endif
    result.latency_ms = timer.elapsedMs();
    LOG_DEBUG(kTag, QString("推理总耗时: %1 ms").arg(result.latency_ms, 0, 'f', 1));
    return result;
}
} // namespace impress

View File

@ -0,0 +1,63 @@
#pragma once
#include <QObject>
#include <QString>
#include <vector>
#include <memory>
#include "stt_engine.h" // RecognitionResult 定义
namespace impress {
/**
 * @brief SenseVoice speech-to-text engine.
 *
 * Runs a SenseVoice model through ONNX Runtime.
 * Pipeline: PCM -> Fbank -> LFR -> CMVN -> ONNX -> CTC decoding.
 */
class SenseVoiceEngine : public QObject {
Q_OBJECT
public:
explicit SenseVoiceEngine(QObject* parent = nullptr);
~SenseVoiceEngine() override;
/** @brief Loads the model synchronously; returns true on success. */
bool loadModelSync(const QString& modelPath,
const QString& tokensPath = QString(),
const QString& device = "cpu",
int numThreads = 4);
/** @brief Loads the model asynchronously (background thread, does not block the UI). */
void loadModelAsync(const QString& modelPath,
const QString& tokensPath = QString(),
const QString& device = "cpu",
int numThreads = 4);
/** @brief Releases the model. */
void unloadModel();
/** @brief Whether a model is currently loaded. */
bool isLoaded() const;
/**
 * @brief Runs speech recognition on one audio buffer.
 * @param samples PCM samples in [-1, 1]
 * @param sampleRate Sample rate of @p samples in Hz
 * @param language One of "zh", "en", "ja", "ko", "yue", "auto"
 * @return The recognition result.
 */
RecognitionResult infer(const std::vector<float>& samples,
int sampleRate,
const QString& language = QString());
signals:
/** @brief Emitted after a model has been loaded successfully. */
void modelLoaded(const QString& modelPath);
/** @brief Emitted when loading a model failed. */
void modelLoadError(const QString& modelPath, const QString& error);
/** @brief Emitted after the model has been released. */
void modelUnloaded();
/** @brief Emitted with a description when an error occurs. */
void error(const QString& message);
private:
struct Impl;
std::unique_ptr<Impl> impl_;
bool loaded_ = false;
};
} // namespace impress

View File

@ -0,0 +1,215 @@
#include "sense_voice_features.h"
#include "sense_voice_cmvn.h"
#include <cmath>
#include <algorithm>
#include <numeric>
#include <complex>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
namespace impress {
/** Minimal single-precision complex number used by the FFT in this file. */
struct Complex {
    float re;
    float im;

    Complex(float r = 0, float i = 0) : re(r), im(i) {}

    /** Component-wise sum. */
    Complex operator+(const Complex& rhs) const {
        return Complex(re + rhs.re, im + rhs.im);
    }

    /** Component-wise difference. */
    Complex operator-(const Complex& rhs) const {
        return Complex(re - rhs.re, im - rhs.im);
    }

    /** Complex product. */
    Complex operator*(const Complex& rhs) const {
        const float realPart = re * rhs.re - im * rhs.im;
        const float imagPart = re * rhs.im + im * rhs.re;
        return Complex(realPart, imagPart);
    }

    /** Scaling by a real factor. */
    Complex operator*(float scale) const {
        return Complex(re * scale, im * scale);
    }

    /** Squared magnitude, re^2 + im^2. */
    float magnitudeSq() const {
        return re * re + im * im;
    }
};
static void fft(std::vector<Complex>& x) {
int n = static_cast<int>(x.size());
if (n <= 1) return;
for (int i = 1, j = 0; i < n; i++) {
int bit = n >> 1;
for (; j & bit; bit >>= 1) j ^= bit;
j ^= bit;
if (i < j) std::swap(x[i], x[j]);
}
for (int len = 2; len <= n; len *= 2) {
float angle = -2.0f * static_cast<float>(M_PI) / len;
Complex wlen(std::cos(angle), std::sin(angle));
for (int i = 0; i < n; i += len) {
Complex w(1.0f, 0.0f);
for (int j = 0; j < len / 2; j++) {
Complex u = x[i + j];
Complex v = x[i + j + len / 2] * w;
x[i + j] = u + v;
x[i + j + len / 2] = u - v;
w = w * wlen;
}
}
}
}
/** Converts a frequency in Hz to the Mel scale: 1125 * ln(1 + hz / 700). */
static float hzToMel(float hz) {
    const float ratio = hz / 700.0f;
    return 1125.0f * std::log(1.0f + ratio);
}
/** Converts a Mel value back to Hz: 700 * (exp(mel / 1125) - 1). */
static float melToHz(float mel) {
    const float growth = std::exp(mel / 1125.0f);
    return 700.0f * (growth - 1.0f);
}
SenseVoiceFeatures::SenseVoiceFeatures(int sampleRate)
: sampleRate_(sampleRate)
{
buildMelFilters();
}
/**
 * Builds a Hann window of winLength_ points:
 * w[i] = 0.5 * (1 - cos(2 * pi * i / (winLength_ - 1))).
 */
std::vector<float> SenseVoiceFeatures::hannWindow() const {
    const float twoPi = 2.0f * static_cast<float>(M_PI);
    std::vector<float> coeffs(winLength_);
    int i = 0;
    for (float& coeff : coeffs) {
        const float phase = twoPi * i / (winLength_ - 1);
        coeff = 0.5f * (1.0f - std::cos(phase));
        ++i;
    }
    return coeffs;
}
/**
 * Builds nMel_ triangular Mel filters over the FFT bins and stores them in
 * melFilters_.
 *
 * nMel_ + 2 edge points are spaced evenly on the Mel scale between 20 Hz and
 * half the sample rate, converted to FFT bin indices and clamped to
 * [0, nFft_ / 2]. Filter m rises from edge m to edge m + 1 and falls to
 * edge m + 2; its weights cover bins [startBin, endBin).
 */
void SenseVoiceFeatures::buildMelFilters() {
    const int nFreq = nFft_ / 2 + 1;
    const float fMin = 20.0f;
    const float fMax = static_cast<float>(sampleRate_) / 2.0f;
    const float melMin = hzToMel(fMin);
    const float melMax = hzToMel(fMax);

    // Edge frequencies (evenly spaced in Mel) mapped to clamped FFT bins.
    std::vector<int> binPoints(nMel_ + 2);
    for (int i = 0; i < nMel_ + 2; i++) {
        const float hz = melToHz(melMin + (melMax - melMin) * i / (nMel_ + 1));
        const int bin = static_cast<int>(std::round((nFft_ + 1) * hz / sampleRate_));
        binPoints[i] = std::max(0, std::min(nFreq - 1, bin));
    }

    melFilters_.resize(nMel_);
    for (int m = 0; m < nMel_; m++) {
        const int left = binPoints[m];
        const int center = binPoints[m + 1];
        const int right = binPoints[m + 2];

        MelFilter filter;
        filter.startBin = left;
        filter.endBin = right + 1;
        filter.weights.resize(filter.endBin - filter.startBin, 0.0f);

        for (int bin = left; bin <= right; bin++) {
            float weight = 0.0f;
            if (bin <= center) {
                // Rising edge. When rounding puts the left edge on the
                // centre bin the edge has zero width; the peak then takes
                // weight 1 instead of 0, otherwise narrow low-frequency
                // filters can end up with no non-zero weight at all.
                weight = (center > left)
                    ? static_cast<float>(bin - left) / (center - left)
                    : 1.0f;
            } else {
                // Falling edge: here center < bin <= right, so the
                // denominator is always positive.
                weight = static_cast<float>(right - bin) / (right - center);
            }
            filter.weights[bin - left] = weight;
        }
        melFilters_[m] = filter;
    }
}
/**
 * Number of full analysis frames that fit into @p numSamples samples:
 * 0 when the signal is shorter than one window, otherwise
 * (numSamples - winLength_) / hopLength_ + 1.
 */
int SenseVoiceFeatures::nFrames(int numSamples) const {
    const bool shorterThanWindow = numSamples < winLength_;
    return shorterThanWindow ? 0 : (numSamples - winLength_) / hopLength_ + 1;
}
std::vector<float> SenseVoiceFeatures::extract(const std::vector<float>& samples) const {
if (samples.empty()) return {};
int numSamples = static_cast<int>(samples.size());
// 1. 预加重
std::vector<float> emphasized(numSamples);
emphasized[0] = samples[0];
for (int i = 1; i < numSamples; i++) {
emphasized[i] = samples[i] - preEmphasisCoeff_ * samples[i - 1];
}
// 2. 分帧 + FFT + Mel + 对数压缩
int numFrames = nFrames(numSamples);
if (numFrames <= 0) return {};
auto window = hannWindow();
int nFreq = nFft_ / 2 + 1;
std::vector<float> fbankData(numFrames * nMel_);
for (int f = 0; f < numFrames; f++) {
int frameStart = f * hopLength_;
// 应用 Hann 窗并 FFT
std::vector<Complex> fftInput(nFft_, {0.0f, 0.0f});
for (int i = 0; i < winLength_ && frameStart + i < numSamples; i++) {
fftInput[i] = {emphasized[frameStart + i] * window[i], 0.0f};
}
fft(fftInput);
// Mel 滤波器组
for (int m = 0; m < nMel_; m++) {
const auto& filter = melFilters_[m];
float energy = 0.0f;
for (int w = 0; w < static_cast<int>(filter.weights.size()); w++) {
int bin = filter.startBin + w;
if (bin < nFreq) {
energy += fftInput[bin].magnitudeSq() * filter.weights[w];
}
}
// 对数压缩 (使用自然对数)
energy = std::max(energy, 1e-10f);
fbankData[f * nMel_ + m] = std::log(energy);
}
}
// 3. LFR (Low Frame Rate) 特征拼接
// 将连续 lfr_window_size 帧 Fbank 特征拼接为一帧
// 步长为 lfr_window_shift
std::vector<float> lfrFeatures;
int lfrOutputDim = nMel_ * kLFRWindowSize; // 80 * 7 = 560
for (int i = 0; ; i += kLFRWindowShift) {
if (i >= numFrames) break;
// 计算 LFR 窗口
int leftPad = std::max(0, kLFRWindowSize / 2 - i);
int rightPad = std::max(0, kLFRWindowSize / 2 - (numFrames - 1 - i));
std::vector<float> frame(lfrOutputDim, 0.0f);
int outIdx = 0;
for (int j = -kLFRWindowSize / 2; j < kLFRWindowSize - kLFRWindowSize / 2; j++) {
int idx = i + j;
// 边界填充:复制第一帧或最后一帧
if (idx < 0) idx = 0;
if (idx >= numFrames) idx = numFrames - 1;
for (int m = 0; m < nMel_; m++) {
frame[outIdx++] = fbankData[idx * nMel_ + m];
}
}
lfrFeatures.insert(lfrFeatures.end(), frame.begin(), frame.end());
}
// 4. CMVN 归一化
cmvn(lfrFeatures);
return lfrFeatures;
}
/**
 * @brief Apply CMVN in place using the kNegMean / kInvStddev tables.
 *
 * Each value is mapped to (x + kNegMean[d]) * kInvStddev[d], where d is the
 * value's column within its kLFROutputDim-wide feature row.
 *
 * @param features Flattened feature rows, normalised in place.
 */
void SenseVoiceFeatures::cmvn(std::vector<float>& features) const {
    // Track the column with a wrapping counter instead of taking a modulo
    // twice per element.
    int column = 0;
    for (float& value : features) {
        value = (value + kNegMean[column]) * kInvStddev[column];
        if (++column == kLFROutputDim) {
            column = 0;
        }
    }
}
} // namespace impress

View File

@ -0,0 +1,54 @@
#pragma once
#include <vector>
namespace impress {
/**
 * @brief SenseVoice front-end feature extractor.
 *
 * Converts PCM samples into the LFR Fbank features consumed by SenseVoice.
 * Pipeline: PCM -> pre-emphasis -> windowed FFT -> Mel filter bank -> log
 *           -> LFR stacking -> CMVN, producing 560-dim feature frames.
 */
class SenseVoiceFeatures {
public:
/**
 * @brief Construct the extractor.
 * @param sampleRate Sample rate of the input audio in Hz (defaults to 16000).
 */
explicit SenseVoiceFeatures(int sampleRate = 16000);
/**
 * @brief Extract LFR Fbank features from PCM samples.
 * @param samples PCM samples, expected in the range [-1, 1].
 * @return Flattened LFR Fbank features, 560 values per LFR frame; empty when
 *         the input is empty or shorter than one analysis window.
 */
std::vector<float> extract(const std::vector<float>& samples) const;
/** @brief Number of Fbank frames (before LFR stacking) that fit in numSamples samples. */
int nFrames(int numSamples) const;
private:
// Fbank parameters
int sampleRate_;
int nFft_ = 512;
int nMel_ = 80;
int hopLength_ = 160; // 10ms @ 16kHz
int winLength_ = 400; // 25ms @ 16kHz
float preEmphasisCoeff_ = 0.97f;
// Mel filter bank (precomputed). Each filter stores the weights for the
// spectrum bins starting at startBin; endBin is one past the last covered bin.
struct MelFilter {
int startBin;
int endBin;
std::vector<float> weights;
};
std::vector<MelFilter> melFilters_;
// Analysis window applied to each frame before the FFT.
std::vector<float> hannWindow() const;
// Fills melFilters_.
void buildMelFilters();
// Normalises LFR features in place using the model's CMVN coefficients.
void cmvn(std::vector<float>& features) const;
};
} // namespace impress

View File

@ -0,0 +1,101 @@
#include "sense_voice_tokenizer.h"
#include "utils/logger.h"
#include <QFile>
#include <QTextStream>
#include <QRegularExpression>
static const char* const kTag = "SenseVoiceTokenizer";
namespace impress {
// Creates a tokenizer with an empty vocabulary. Until load() succeeds,
// decode() renders every non-special token as a "[T<id>]" placeholder.
SenseVoiceTokenizer::SenseVoiceTokenizer() = default;
bool SenseVoiceTokenizer::load(const QString& tokensPath) {
QFile file(tokensPath);
if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
LOG_ERROR(kTag, QString("无法打开词表文件: %1").arg(tokensPath));
return false;
}
QTextStream stream(&file);
stream.setEncoding(QStringConverter::Utf8);
tokenToString_.clear();
int lineCount = 0;
while (!stream.atEnd()) {
QString line = stream.readLine().trimmed();
if (line.isEmpty()) continue;
// 格式: "<token> <id>" — 最后一个是 token_id
int lastSpace = line.lastIndexOf(' ');
if (lastSpace < 0) continue;
bool ok = false;
int tokenId = line.mid(lastSpace + 1).toInt(&ok);
if (!ok) continue;
QString tokenStr = line.left(lastSpace);
tokenToString_[tokenId] = tokenStr;
lineCount++;
}
LOG_INFO(kTag, QString("词表已加载: %1 个词条 (%2)").arg(lineCount).arg(tokensPath));
return !tokenToString_.empty();
}
QString SenseVoiceTokenizer::decode(const std::vector<int>& tokens) const {
if (tokens.empty()) return "";
QString result;
for (int token : tokens) {
// 跳过特殊 token
if (token == kTokenBOS || token == kTokenEOS || token == kTokenBlank) {
continue;
}
auto it = tokenToString_.find(token);
if (it != tokenToString_.end()) {
result += decodeBPE(it->second);
} else {
result += QString("[T%1]").arg(token);
}
}
return result;
}
/**
 * @brief Turn one vocabulary entry into display text.
 *
 * Replaces every U+2581 word-boundary marker with a space, then replaces each
 * "<0xHEX>" escape with the single character whose value is HEX. An escape
 * whose number cannot be parsed is left as written.
 *
 * NOTE(review): SentencePiece byte-fallback escapes denote raw UTF-8 bytes;
 * mapping each one to its own character is only exact for ASCII values.
 *
 * @param token Vocabulary string of a single token.
 * @return The token's text.
 */
QString SenseVoiceTokenizer::decodeBPE(const QString& token) const {
    QString text = token;
    text.replace(QChar(0x2581), ' ');

    static QRegularExpression hexPattern("<0x([0-9A-Fa-f]+)>");
    QRegularExpressionMatchIterator escapes = hexPattern.globalMatch(text);
    if (!escapes.hasNext()) {
        // No escapes present: the marker replacement above was all there was to do.
        return text;
    }

    QString expanded;
    int cursor = 0;  // end of the previous escape within `text`
    while (escapes.hasNext()) {
        const QRegularExpressionMatch escape = escapes.next();

        // Copy the literal text between the previous escape and this one.
        expanded += text.mid(cursor, escape.capturedStart() - cursor);

        bool parsed = false;
        const int code = escape.captured(1).toInt(&parsed, 16);
        if (parsed) {
            expanded += QChar(code);
        } else {
            expanded += escape.captured(0);
        }
        cursor = escape.capturedEnd();
    }
    expanded += text.mid(cursor);  // trailing literal text
    return expanded;
}
} // namespace impress

View File

@ -0,0 +1,42 @@
#pragma once
#include <QString>
#include <QStringList>
#include <vector>
#include <unordered_map>
namespace impress {
/**
 * @brief SenseVoice tokenizer.
 *
 * Loads the vocabulary from tokens.txt and decodes BPE token ids into text.
 * SenseVoice uses a SentencePiece-style BPE vocabulary.
 */
class SenseVoiceTokenizer {
public:
SenseVoiceTokenizer();
/** @brief Load the vocabulary from tokens.txt; returns true when at least one entry was loaded. */
bool load(const QString& tokensPath);
/** @brief Decode token ids into text. */
QString decode(const std::vector<int>& tokens) const;
/** @brief Whether a vocabulary has been loaded. */
bool isLoaded() const { return !tokenToString_.empty(); }
/** @brief Number of entries in the vocabulary. */
int vocabSize() const { return static_cast<int>(tokenToString_.size()); }
// Special tokens (skipped by decode())
static constexpr int kTokenBlank = 0; // CTC blank / <unk>
static constexpr int kTokenBOS = 1; // <s>
static constexpr int kTokenEOS = 2; // </s>
private:
// Maps token id -> token string as read from tokens.txt.
std::unordered_map<int, QString> tokenToString_;
// Converts a single vocabulary string into display text.
QString decodeBPE(const QString& token) const;
};
} // namespace impress

View File

@ -1,5 +1,5 @@
#include "file_transcribe_page.h"
#include "core/stt_engine.h"
#include "core/sense_voice_engine.h"
#include "audio/audio_decoder.h"
#include "app/config_manager.h"
#include "utils/logger.h"
@ -34,7 +34,7 @@ namespace impress {
FileTranscribePage::FileTranscribePage(ConfigManager* configManager, QWidget* parent)
: QWidget(parent)
, configManager_(configManager)
, sttEngine_(new STTEngine(this))
, sttEngine_(new SenseVoiceEngine(this))
, audioDecoder_(new AudioDecoder(this))
{
setupUI();
@ -161,6 +161,7 @@ void FileTranscribePage::onStartTranscribe() {
(void)QtConcurrent::run([this, modelPath]() {
bool success = sttEngine_->loadModelSync(modelPath,
configManager_->get("stt.tokens_path").toString(),
configManager_->get("stt.device").toString(),
configManager_->get("stt.num_threads").toInt());

View File

@ -14,7 +14,7 @@ class QByteArray;
namespace impress {
class ConfigManager;
class STTEngine;
class SenseVoiceEngine;
class AudioDecoder;
struct TranscribeTask {
@ -62,7 +62,7 @@ private:
QString formatSRTTime(double seconds) const;
ConfigManager* configManager_;
STTEngine* sttEngine_;
SenseVoiceEngine* sttEngine_;
AudioDecoder* audioDecoder_;
// UI 控件

View File

@ -47,7 +47,7 @@ void SettingsPage::setupUI() {
sttLayout->addRow("模型路径:", modelRow);
modelTypeCombo_ = new QComboBox(this);
modelTypeCombo_->addItems({"whisper", "paraformer", "conformer"});
modelTypeCombo_->addItems({"sense_voice", "whisper", "paraformer", "conformer"});
sttLayout->addRow("模型类型:", modelTypeCombo_);
deviceCombo_ = new QComboBox(this);
@ -59,6 +59,15 @@ void SettingsPage::setupUI() {
threadSpin_->setValue(4);
sttLayout->addRow("推理线程数:", threadSpin_);
auto* tokensRow = new QHBoxLayout();
tokensPathEdit_ = new QLineEdit(this);
tokensPathEdit_->setPlaceholderText("选择 tokens.txt 文件路径...");
tokensBrowseBtn_ = new QPushButton("浏览...", this);
connect(tokensBrowseBtn_, &QPushButton::clicked, this, &SettingsPage::onBrowseTokensPath);
tokensRow->addWidget(tokensPathEdit_);
tokensRow->addWidget(tokensBrowseBtn_);
sttLayout->addRow("词表路径:", tokensRow);
sampleRateSpin_ = new QSpinBox(this);
sampleRateSpin_->setRange(8000, 192000);
sampleRateSpin_->setSingleStep(1000);
@ -158,6 +167,7 @@ void SettingsPage::setupUI() {
void SettingsPage::loadFromConfig() {
modelPathEdit_->setText(configManager_->get("stt.model_path").toString());
tokensPathEdit_->setText(configManager_->get("stt.tokens_path").toString());
modelTypeCombo_->setCurrentText(configManager_->get("stt.model_type").toString());
deviceCombo_->setCurrentText(configManager_->get("stt.device").toString());
threadSpin_->setValue(configManager_->get("stt.num_threads").toInt());
@ -179,6 +189,7 @@ void SettingsPage::loadFromConfig() {
void SettingsPage::saveToConfig() {
configManager_->set("stt.model_path", modelPathEdit_->text());
configManager_->set("stt.tokens_path", tokensPathEdit_->text());
configManager_->set("stt.model_type", modelTypeCombo_->currentText());
configManager_->set("stt.device", deviceCombo_->currentText());
configManager_->set("stt.num_threads", threadSpin_->value());
@ -206,6 +217,14 @@ void SettingsPage::onBrowseModelPath() {
}
}
void SettingsPage::onBrowseTokensPath() {
QString path = QFileDialog::getOpenFileName(this, "选择词表文件", "",
"词表文件 (tokens.txt);;所有文件 (*.*)");
if (!path.isEmpty()) {
tokensPathEdit_->setText(path);
}
}
void SettingsPage::onSaveConfig() {
saveToConfig();
if (configManager_->save()) {

View File

@ -29,6 +29,7 @@ public:
private slots:
void onBrowseModelPath();
void onBrowseTokensPath();
void onSaveConfig();
void onResetConfig();
@ -42,6 +43,8 @@ private:
// STT 设置
QLineEdit* modelPathEdit_;
QPushButton* browseBtn_;
QLineEdit* tokensPathEdit_;
QPushButton* tokensBrowseBtn_;
QComboBox* modelTypeCombo_;
QComboBox* deviceCombo_;
QSpinBox* threadSpin_;

View File

@ -1,5 +1,5 @@
#include "stt_test_page.h"
#include "core/stt_engine.h"
#include "core/sense_voice_engine.h"
#include "audio/audio_capture.h"
#include "audio/audio_ring_buffer.h"
#include "widgets/audio_waveform.h"
@ -26,7 +26,7 @@ namespace impress {
STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent)
: QWidget(parent)
, configManager_(configManager)
, sttEngine_(new STTEngine(this))
, sttEngine_(new SenseVoiceEngine(this))
, audioCapture_(new AudioCapture(this))
{
setupUI();
@ -34,11 +34,11 @@ STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent)
// 信号连接
connect(audioCapture_, &AudioCapture::audioDataReady,
this, &STTTestPage::onAudioDataReady);
connect(sttEngine_, &STTEngine::modelLoaded,
connect(sttEngine_, &SenseVoiceEngine::modelLoaded,
this, &STTTestPage::onModelLoaded);
connect(sttEngine_, &STTEngine::modelLoadError,
connect(sttEngine_, &SenseVoiceEngine::modelLoadError,
this, &STTTestPage::onModelLoadError);
connect(sttEngine_, &STTEngine::modelUnloaded,
connect(sttEngine_, &SenseVoiceEngine::modelUnloaded,
this, &STTTestPage::onModelUnloaded);
}
@ -133,6 +133,7 @@ void STTTestPage::onToggleRecording() {
updateUIState();
sttEngine_->loadModelAsync(modelPath,
configManager_->get("stt.tokens_path").toString(),
configManager_->get("stt.device").toString(),
configManager_->get("stt.num_threads").toInt());

View File

@ -13,13 +13,14 @@ class QSpinBox;
namespace impress {
class ConfigManager;
class STTEngine;
class SenseVoiceEngine;
class AudioCapture;
/**
* @brief STT
*
* +
* 使 SenseVoice
* UI
*/
class STTTestPage : public QWidget {
@ -43,7 +44,7 @@ private:
void processAudioChunk(const std::vector<float>& samples, int sampleRate);
ConfigManager* configManager_;
STTEngine* sttEngine_;
SenseVoiceEngine* sttEngine_;
AudioCapture* audioCapture_;
// UI 控件