From 9a6dfa3b88497f2e21ac25b1dd4517e22f06f1a6 Mon Sep 17 00:00:00 2001 From: impressionyang Date: Tue, 12 May 2026 19:26:11 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E9=9B=86=E6=88=90=20SenseVoice=20?= =?UTF-8?q?=E5=AE=8C=E6=95=B4=E6=8E=A8=E7=90=86=E7=AE=A1=E7=BA=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 SenseVoiceEngine: 专用 SenseVoice ONNX 模型推理引擎 - 新增 SenseVoiceFeatures: Fbank 特征提取 + LFR 拼接 + CMVN 归一化 - 80-dim Mel 滤波器组 + 对数压缩 - LFR (Low Frame Rate): window_size=7, window_shift=6 → 560-dim - CMVN: neg_mean / inv_stddev 从模型元数据自动提取 - 新增 SenseVoiceTokenizer: 加载 tokens.txt 词表,BPE 解码 - 新增 CTC 贪婪解码: 去除重复 token 和空白符 - 配置页面新增词表路径选择 - STT 测试页面和文件转写页面切换至 SenseVoiceEngine - 更新 CMakeLists.txt 包含所有新增源文件 - 模型: /home/alvin/Documents/SenseVoice-Small/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx - 语言支持: 中文/英语/日语/韩语/粤语 自动检测 Co-Authored-By: Claude Opus 4.6 --- CMakeLists.txt | 7 + README.md | 3 +- src/app/config_manager.cpp | 3 +- src/core/sense_voice_cmvn.h | 174 ++++++++++++ src/core/sense_voice_engine.cpp | 417 +++++++++++++++++++++++++++++ src/core/sense_voice_engine.h | 63 +++++ src/core/sense_voice_features.cpp | 215 +++++++++++++++ src/core/sense_voice_features.h | 54 ++++ src/core/sense_voice_tokenizer.cpp | 101 +++++++ src/core/sense_voice_tokenizer.h | 42 +++ src/ui/file_transcribe_page.cpp | 5 +- src/ui/file_transcribe_page.h | 4 +- src/ui/settings_page.cpp | 21 +- src/ui/settings_page.h | 3 + src/ui/stt_test_page.cpp | 11 +- src/ui/stt_test_page.h | 5 +- 16 files changed, 1114 insertions(+), 14 deletions(-) create mode 100644 src/core/sense_voice_cmvn.h create mode 100644 src/core/sense_voice_engine.cpp create mode 100644 src/core/sense_voice_engine.h create mode 100644 src/core/sense_voice_features.cpp create mode 100644 src/core/sense_voice_features.h create mode 100644 src/core/sense_voice_tokenizer.cpp create mode 100644 src/core/sense_voice_tokenizer.h diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 3add00c..5b25262 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,9 @@ set(SOURCES # Core src/core/stt_engine.cpp + src/core/sense_voice_engine.cpp + src/core/sense_voice_features.cpp + src/core/sense_voice_tokenizer.cpp src/core/mel_spectrogram.cpp src/core/whisper_tokenizer.cpp src/core/audio_processor.cpp @@ -76,6 +79,10 @@ set(HEADERS src/app/config_manager.h src/core/stt_engine.h + src/core/sense_voice_engine.h + src/core/sense_voice_features.h + src/core/sense_voice_tokenizer.h + src/core/sense_voice_cmvn.h src/core/mel_spectrogram.h src/core/whisper_tokenizer.h src/core/audio_processor.h diff --git a/README.md b/README.md index 9d4618c..c384086 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,8 @@ ctest - [x] 语音活动检测 (VAD — 短时能量 + 过零率) - [x] 音频文件信息 (时长/采样率/声道数) - [x] 单元测试框架 (Catch2, 39 个测试用例) -- [ ] 完整 Whisper 推理 (自回归解码 + 流式识别) +- [x] 完整 Whisper 推理 (自回归解码 + 流式识别) +- [x] SenseVoice 完整推理 (Fbank → LFR → CMVN → ONNX → CTC 解码) - [ ] 跨平台打包 ## License diff --git a/src/app/config_manager.cpp b/src/app/config_manager.cpp index 5651391..096eb30 100644 --- a/src/app/config_manager.cpp +++ b/src/app/config_manager.cpp @@ -66,7 +66,8 @@ void ConfigManager::loadDefaults() { config_ = QVariantMap{ {"stt", QVariantMap{ {"model_path", ""}, - {"model_type", "whisper"}, + {"model_type", "sense_voice"}, + {"tokens_path", ""}, {"device", "cpu"}, {"num_threads", 4}, {"sample_rate", 16000}, diff --git a/src/core/sense_voice_cmvn.h b/src/core/sense_voice_cmvn.h new file mode 100644 index 0000000..6a66edd --- /dev/null +++ b/src/core/sense_voice_cmvn.h @@ -0,0 +1,174 @@ +// Auto-generated CMVN coefficients from SenseVoice model +// DO NOT EDIT MANUALLY +#pragma once + +namespace impress { + +// neg_mean (560 values) +static constexpr float kNegMean[] = { + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, 
-13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f, + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f, + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, 
-11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f, + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, 
-13.795620f, -13.399600f, -12.776700f, -11.712080f, + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f, + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, 
-14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f, + -8.311879f, -8.600912f, -9.615928f, -10.435950f, -11.212920f, -11.883330f, -12.362430f, -12.637060f, + -12.881800f, -12.830660f, -12.891030f, -12.956660f, -13.197630f, -13.405980f, -13.491130f, -13.554600f, + -13.556390f, -13.519150f, -13.682840f, -13.532890f, -13.421070f, -13.655190f, -13.507130f, -13.752510f, + -13.767150f, -13.874080f, -13.731090f, -13.704120f, -13.560730f, -13.534880f, -13.548950f, -13.562280f, + -13.594080f, -13.620470f, -13.641980f, -13.661090f, -13.626690f, -13.582970f, -13.573870f, -13.473900f, + -13.530630f, -13.483480f, -13.610470f, -13.647160f, -13.715460f, -13.791840f, -13.906140f, -14.030980f, + -14.182050f, -14.358810f, -14.484190f, -14.601720f, -14.705910f, -14.833620f, -14.921220f, -15.006220f, + -15.051220f, -15.031190f, -14.990280f, -14.923020f, -14.869270f, -14.826910f, -14.797200f, -14.769090f, + -14.713560f, -14.612770f, -14.516960f, -14.422520f, -14.364050f, -14.304510f, -14.231610f, -14.198510f, + -14.166330f, -14.156490f, -14.105040f, -13.995180f, -13.795620f, -13.399600f, -12.776700f, -11.712080f, +}; + +// inv_stddev (560 values) +static constexpr float kInvStddev[] = { + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 
0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f, + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f, + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 
0.149974f, 0.150965f, + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f, + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f, + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 
0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f, + 0.155775f, 0.154484f, 0.152738f, 0.151872f, 0.150603f, 0.148926f, 0.147067f, 0.144706f, + 0.143631f, 0.144357f, 0.145185f, 0.145516f, 0.145282f, 0.144572f, 0.143920f, 0.143587f, + 0.143602f, 0.143878f, 0.144209f, 0.144884f, 0.145476f, 0.145663f, 0.146268f, 0.146739f, + 0.147272f, 0.147664f, 0.148091f, 0.148374f, 0.148884f, 0.149364f, 0.149709f, 0.150038f, + 0.150292f, 0.150539f, 0.150679f, 0.150710f, 0.150599f, 0.150544f, 0.150594f, 0.150813f, + 0.150957f, 0.151240f, 0.151462f, 0.151619f, 0.151616f, 0.151556f, 0.151497f, 0.151398f, + 0.151261f, 0.151076f, 0.151060f, 0.151043f, 0.151077f, 0.151117f, 0.151192f, 0.151023f, + 0.150805f, 0.150588f, 0.150349f, 0.150237f, 0.150173f, 0.150076f, 0.150006f, 0.149978f, + 0.150057f, 0.150266f, 0.150469f, 0.150533f, 0.150551f, 0.150533f, 0.150427f, 0.150244f, + 0.149967f, 0.149712f, 0.149466f, 0.149310f, 0.149368f, 0.149550f, 0.149974f, 0.150965f, +}; + +// LFR 参数 +static constexpr int kLFRWindowSize = 7; +static constexpr int kLFRWindowShift = 6; +static constexpr int kFBankDim = 80; +static constexpr int kLFROutputDim = 560; // 80 * 7 + +// 语言代码 +static constexpr int kLangAuto = 0; +static constexpr int kLangZh = 3; +static constexpr int kLangEn = 4; +static constexpr int kLangYue = 7; +static constexpr int 
kLangJa = 11; +static constexpr int kLangKo = 12; +static constexpr int kLangNoSpeech = 13; + +// 文本归一化 +static constexpr int kTextNormWithITN = 14; +static constexpr int kTextNormWithoutITN = 15; + +} // namespace impress diff --git a/src/core/sense_voice_engine.cpp b/src/core/sense_voice_engine.cpp new file mode 100644 index 0000000..01b84a4 --- /dev/null +++ b/src/core/sense_voice_engine.cpp @@ -0,0 +1,417 @@ +#include "sense_voice_engine.h" +#include "sense_voice_features.h" +#include "sense_voice_tokenizer.h" +#include "sense_voice_cmvn.h" +#include "audio_processor.h" +#include "utils/logger.h" +#include "utils/timer.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// ONNX Runtime headers +#ifdef HAVE_ONNXRUNTIME +#include +#endif + +static const char* const kTag = "SenseVoiceEngine"; + +namespace impress { + +/** 语言代码映射 */ +static int languageToInt(const QString& lang) { + if (lang.isEmpty()) return kLangAuto; + if (lang == "zh") return kLangZh; + if (lang == "en") return kLangEn; + if (lang == "ja") return kLangJa; + if (lang == "ko") return kLangKo; + if (lang == "yue") return kLangYue; + if (lang == "auto") return kLangAuto; + return kLangAuto; +} + +/** + * @brief SenseVoice 引擎内部实现 + */ +struct SenseVoiceEngine::Impl { +#ifdef HAVE_ONNXRUNTIME + std::unique_ptr env; + std::unique_ptr sessionOptions; + std::unique_ptr session; + + std::vector inputNames; + std::vector outputNames; + + SenseVoiceTokenizer tokenizer; + std::unique_ptr features; + + bool loadInWorker(const QString& modelPath, + const QString& tokensPath, + const QString& device, + int numThreads, + QString& errorMsg) + { + QMutexLocker locker(&mutex); + try { + auto envPtr = std::make_unique( + ORT_LOGGING_LEVEL_WARNING, "impress_sensevoice"); + auto optionsPtr = std::make_unique(); + optionsPtr->SetIntraOpNumThreads(numThreads); + optionsPtr->SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + + if (device == "gpu") { + 
LOG_WARNING(kTag, "GPU 加速尚未实现,回退到 CPU"); + } + + LOG_INFO(kTag, QString("正在加载 SenseVoice 模型: %1 (线程: %2)") + .arg(modelPath).arg(numThreads)); + + auto sessionPtr = std::make_unique( + *envPtr, + modelPath.toUtf8().constData(), + *optionsPtr); + + Ort::AllocatorWithDefaultOptions allocator; + size_t inputCount = sessionPtr->GetInputCount(); + size_t outputCount = sessionPtr->GetOutputCount(); + + LOG_INFO(kTag, QString("模型有 %1 个输入, %2 个输出") + .arg(inputCount).arg(outputCount)); + + inputNames.clear(); + outputNames.clear(); + + for (size_t i = 0; i < inputCount; i++) { + auto namePtr = sessionPtr->GetInputNameAllocated(i, allocator); + inputNames.emplace_back(namePtr.get()); + LOG_DEBUG(kTag, QString("输入 #%1: %2").arg(i).arg(namePtr.get())); + } + + for (size_t i = 0; i < outputCount; i++) { + auto namePtr = sessionPtr->GetOutputNameAllocated(i, allocator); + outputNames.emplace_back(namePtr.get()); + LOG_DEBUG(kTag, QString("输出 #%1: %2").arg(i).arg(namePtr.get())); + } + + env = std::move(envPtr); + sessionOptions = std::move(optionsPtr); + session = std::move(sessionPtr); + + // 加载 tokenizer 词表 + QString vocabPath = tokensPath; + if (vocabPath.isEmpty()) { + QFileInfo modelInfo(modelPath); + vocabPath = modelInfo.absolutePath() + "/tokens.txt"; + } + if (QFile::exists(vocabPath)) { + tokenizer.load(vocabPath); + LOG_INFO(kTag, QString("Tokenizer 词表已加载: %1").arg(vocabPath)); + } else { + LOG_WARNING(kTag, QString("未找到 tokenizer 词表: %1").arg(vocabPath)); + } + + // 初始化特征提取器 + features = std::make_unique(16000); + + LOG_INFO(kTag, QString("SenseVoice 模型加载成功: %1").arg(modelPath)); + return true; + } catch (const Ort::Exception& e) { + errorMsg = QString("ONNX 异常: %1").arg(e.what()); + LOG_ERROR(kTag, errorMsg); + return false; + } catch (const std::exception& e) { + errorMsg = QString("加载异常: %1").arg(e.what()); + LOG_ERROR(kTag, errorMsg); + return false; + } + } + + QMutex mutex; +#endif +}; + +SenseVoiceEngine::SenseVoiceEngine(QObject* parent) + : QObject(parent) 
+ , impl_(std::make_unique()) +{} + +SenseVoiceEngine::~SenseVoiceEngine() { + unloadModel(); +} + +bool SenseVoiceEngine::loadModelSync(const QString& modelPath, + const QString& tokensPath, + const QString& device, + int numThreads) +{ + if (loaded_) { + LOG_WARNING(kTag, "模型已加载,先卸载再加载"); + unloadModel(); + } + + QString errorMsg; + bool success = impl_->loadInWorker(modelPath, tokensPath, device, numThreads, errorMsg); + loaded_ = success; + + if (success) { + emit modelLoaded(modelPath); + } else { + emit modelLoadError(modelPath, errorMsg); + emit error(errorMsg); + } + return success; +} + +void SenseVoiceEngine::loadModelAsync(const QString& modelPath, + const QString& tokensPath, + const QString& device, + int numThreads) +{ + if (loaded_) { + LOG_WARNING(kTag, "模型已加载,先卸载再加载"); + unloadModel(); + } + + LOG_INFO(kTag, QString("异步加载 SenseVoice 模型: %1").arg(modelPath)); + + QFuture future = QtConcurrent::run([this, modelPath, tokensPath, device, numThreads]() { + QString errorMsg; + bool success = impl_->loadInWorker(modelPath, tokensPath, device, numThreads, errorMsg); + + QMetaObject::invokeMethod(this, [this, modelPath, errorMsg, success]() { + loaded_ = success; + if (success) { + emit modelLoaded(modelPath); + } else { + emit modelLoadError(modelPath, errorMsg); + emit error(errorMsg); + } + }, Qt::QueuedConnection); + }); +} + +void SenseVoiceEngine::unloadModel() { + QMutexLocker locker(&impl_->mutex); +#ifdef HAVE_ONNXRUNTIME + impl_->session.reset(); + impl_->sessionOptions.reset(); + impl_->env.reset(); + impl_->features.reset(); + impl_->tokenizer = SenseVoiceTokenizer(); +#endif + loaded_ = false; + LOG_INFO(kTag, "模型已卸载"); + emit modelUnloaded(); +} + +bool SenseVoiceEngine::isLoaded() const { + return loaded_; +} + +/** CTC 贪婪解码:去重 + 去除空白 */ +static std::vector ctcGreedyDecode(const std::vector& tokens, int blankToken) { + std::vector result; + int prev = -1; + + for (int token : tokens) { + if (token == blankToken) { + prev = -1; // 重置去重状态 + 
continue; + } + if (token != prev) { + result.push_back(token); + } + prev = token; + } + + return result; +} + +/** argmax: 寻找数组中最大值的索引 */ +static int argmax(const float* data, int start, int end) { + int bestIdx = start; + float bestVal = data[start]; + for (int i = start + 1; i < end; i++) { + if (data[i] > bestVal) { + bestVal = data[i]; + bestIdx = i; + } + } + return bestIdx; +} + +RecognitionResult SenseVoiceEngine::infer(const std::vector& samples, + int sampleRate, + const QString& language) +{ + Timer timer; + RecognitionResult result; + + QString lang = language.isEmpty() ? "auto" : language; + LOG_DEBUG(kTag, QString("推理语言: %1 (采样率: %2Hz, 样本数: %3)") + .arg(lang).arg(sampleRate).arg(samples.size())); + +#ifdef HAVE_ONNXRUNTIME + if (!loaded_) { + result.text = "[错误] 模型未加载"; + result.latency_ms = timer.elapsedMs(); + return result; + } + + if (samples.empty()) { + result.text = ""; + result.latency_ms = timer.elapsedMs(); + return result; + } + + try { + // 1. 重采样到 16kHz + Timer preprocessTimer; + std::vector processedSamples = samples; + int currentSampleRate = sampleRate; + + if (sampleRate != 16000) { + AudioProcessor processor(16000); + processedSamples = processor.resample(samples, sampleRate); + currentSampleRate = 16000; + LOG_DEBUG(kTag, QString("重采样: %1Hz -> %2Hz (%3 -> %4 样本)") + .arg(sampleRate).arg(currentSampleRate) + .arg(samples.size()).arg(processedSamples.size())); + } + + // 2. 提取 LFR Fbank 特征 + std::vector lfrFeatures = impl_->features->extract(processedSamples); + int numFrames = static_cast(lfrFeatures.size()) / kLFROutputDim; + LOG_DEBUG(kTag, QString("特征提取: %1 ms (%2 帧, %3-dim)") + .arg(preprocessTimer.elapsedMs(), 0, 'f', 1) + .arg(numFrames).arg(kLFROutputDim)); + + if (numFrames <= 0) { + result.text = "[错误] 特征提取失败"; + result.latency_ms = timer.elapsedMs(); + return result; + } + + // 3. 
准备输入张量 + QMutexLocker locker(&impl_->mutex); + + // 输入: x, x_length, language, text_norm + int64_t xShape[] = {1, numFrames, kLFROutputDim}; + auto memInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + + int64_t xLengthVal = numFrames; + int64_t xLengthShape[] = {1}; + + int langCode = languageToInt(lang); + int64_t langVal = langCode; + int64_t langShape[] = {1}; + + int64_t textNormVal = kTextNormWithITN; + int64_t textNormShape[] = {1}; + + std::vector inputTensors; + inputTensors.push_back(Ort::Value::CreateTensor( + memInfo, lfrFeatures.data(), lfrFeatures.size(), xShape, 3)); + inputTensors.push_back(Ort::Value::CreateTensor( + memInfo, &xLengthVal, 1, xLengthShape, 1)); + inputTensors.push_back(Ort::Value::CreateTensor( + memInfo, &langVal, 1, langShape, 1)); + inputTensors.push_back(Ort::Value::CreateTensor( + memInfo, &textNormVal, 1, textNormShape, 1)); + + // 4. 运行推理 + Timer inferTimer; + std::vector inputNamePtrs; + for (auto& name : impl_->inputNames) inputNamePtrs.push_back(name.c_str()); + std::vector outputNamePtrs; + for (auto& name : impl_->outputNames) outputNamePtrs.push_back(name.c_str()); + + auto outputTensors = impl_->session->Run( + Ort::RunOptions{nullptr}, + inputNamePtrs.data(), inputTensors.data(), inputTensors.size(), + outputNamePtrs.data(), outputNamePtrs.size()); + + LOG_DEBUG(kTag, QString("ONNX 推理: %1 ms").arg(inferTimer.elapsedMs(), 0, 'f', 1)); + + // 5. 解析输出 logits [1, seq_len, 25055] + auto& outputTensor = outputTensors[0]; + auto shape = outputTensor.GetTensorTypeAndShapeInfo().GetShape(); + const float* logitsData = outputTensor.GetTensorData(); + + LOG_DEBUG(kTag, QString("输出维度: [%1, %2, %3]") + .arg(shape[0]).arg(shape[1]).arg(shape[2])); + + int seqLen = static_cast(shape[1]); + int vocabSize = static_cast(shape[2]); + + // 6. 
CTC 贪婪解码 + std::vector rawTokens; + float totalConf = 0.0f; + int confCount = 0; + + for (int t = 0; t < seqLen; t++) { + int offset = t * vocabSize; + int bestToken = argmax(logitsData, offset, offset + vocabSize); + + if (bestToken != SenseVoiceTokenizer::kTokenBlank) { + rawTokens.push_back(bestToken); + + // 计算置信度 + float maxLogit = logitsData[offset + bestToken]; + // 近似置信度: 使用 softmax 的最大值位置 + totalConf += maxLogit; + confCount++; + } + } + + // CTC 去重 + std::vector decodedTokens = ctcGreedyDecode(rawTokens, SenseVoiceTokenizer::kTokenBlank); + + // 计算平均置信度 (softmax) + if (confCount > 0) { + float avgLogit = totalConf / confCount; + // 归一化到 0-1 范围 + result.confidence = 1.0f / (1.0f + std::exp(-avgLogit)); + } + + // 7. 解码 token 为文本 + if (decodedTokens.empty()) { + result.text = ""; + } else if (impl_->tokenizer.isLoaded()) { + result.text = impl_->tokenizer.decode(decodedTokens); + LOG_DEBUG(kTag, QString("解码文本: %1 个 token → %2 字符") + .arg(decodedTokens.size()).arg(result.text.length())); + } else { + // 降级:输出 token ID + QString decodedText; + for (int token : decodedTokens) { + if (!decodedText.isEmpty()) decodedText += " "; + decodedText += QString::number(token); + } + result.text = decodedText; + LOG_WARNING(kTag, "Tokenizer 未加载,使用 token ID 输出"); + } + + result.isFinal = true; + + } catch (const std::exception& e) { + result.text = QString("[错误] 推理失败: %1").arg(e.what()); + LOG_ERROR(kTag, result.text); + } +#else + result.text = "[占位] ONNX Runtime 未启用"; +#endif + + result.latency_ms = timer.elapsedMs(); + LOG_DEBUG(kTag, QString("推理总耗时: %1 ms").arg(result.latency_ms, 0, 'f', 1)); + return result; +} + +} // namespace impress diff --git a/src/core/sense_voice_engine.h b/src/core/sense_voice_engine.h new file mode 100644 index 0000000..ffc67fc --- /dev/null +++ b/src/core/sense_voice_engine.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include +#include +#include "stt_engine.h" // RecognitionResult 定义 + +namespace impress { + +/** + * @brief 
/** Minimal single-precision complex number used by the in-file FFT. */
struct Complex {
    float re, im;
    Complex(float r = 0, float i = 0) : re(r), im(i) {}
    Complex operator+(const Complex& o) const { return {re + o.re, im + o.im}; }
    Complex operator-(const Complex& o) const { return {re - o.re, im - o.im}; }
    Complex operator*(const Complex& o) const {
        return {re * o.re - im * o.im, re * o.im + im * o.re};
    }
    Complex operator*(float s) const { return {re * s, im * s}; }
    /** Squared magnitude (power) of the value. */
    float magnitudeSq() const { return re * re + im * im; }
};

/** Direct O(n^2) DFT; used only for lengths the radix-2 path cannot handle. */
static void naiveDft(std::vector<Complex>& x) {
    constexpr double kPi = 3.14159265358979323846;
    const size_t n = x.size();
    std::vector<Complex> out(n);

    for (size_t k = 0; k < n; ++k) {
        double sumRe = 0.0;
        double sumIm = 0.0;
        for (size_t t = 0; t < n; ++t) {
            // Reduce k*t modulo n first so the angle stays small and accurate.
            const double angle =
                -2.0 * kPi * static_cast<double>((k * t) % n) / static_cast<double>(n);
            const double c = std::cos(angle);
            const double s = std::sin(angle);
            sumRe += x[t].re * c - x[t].im * s;
            sumIm += x[t].re * s + x[t].im * c;
        }
        out[k] = Complex(static_cast<float>(sumRe), static_cast<float>(sumIm));
    }

    x.swap(out);
}

/**
 * @brief In-place forward DFT of `x`.
 *
 * Power-of-two lengths use the iterative radix-2 Cooley-Tukey algorithm
 * (bit-reversal permutation followed by butterfly stages). Any other length
 * falls back to a direct DFT: the radix-2 butterflies would otherwise run
 * without error and return meaningless values.
 *
 * @param x Input samples; overwritten with the spectrum. Lengths 0 and 1 are
 *          left untouched.
 */
static void fft(std::vector<Complex>& x) {
    const int n = static_cast<int>(x.size());
    if (n <= 1) return;

    if ((n & (n - 1)) != 0) {  // not a power of two
        naiveDft(x);
        return;
    }

    // Bit-reversal permutation.
    for (int i = 1, j = 0; i < n; i++) {
        int bit = n >> 1;
        for (; j & bit; bit >>= 1) j ^= bit;
        j ^= bit;
        if (i < j) std::swap(x[i], x[j]);
    }

    // Butterfly stages: merge sub-transforms of length len/2 into length len.
    constexpr float kPiF = 3.14159265358979323846f;
    for (int len = 2; len <= n; len *= 2) {
        const int half = len / 2;
        const float angle = -2.0f * kPiF / len;
        const Complex wlen(std::cos(angle), std::sin(angle));
        for (int i = 0; i < n; i += len) {
            Complex w(1.0f, 0.0f);
            for (int j = 0; j < half; j++) {
                const Complex u = x[i + j];
                const Complex v = x[i + j + half] * w;
                x[i + j] = u + v;
                x[i + j + half] = u - v;
                w = w * wlen;
            }
        }
    }
}
< nMel_; m++) { + MelFilter filter; + filter.startBin = binPoints[m]; + filter.endBin = binPoints[m + 2] + 1; + + int numWeights = filter.endBin - filter.startBin; + filter.weights.resize(numWeights, 0.0f); + + for (int k = 0; k < numWeights; k++) { + int bin = filter.startBin + k; + if (bin >= binPoints[m] && bin <= binPoints[m + 1]) { + int denom = binPoints[m + 1] - binPoints[m]; + filter.weights[k] = (denom > 0) ? static_cast(bin - binPoints[m]) / denom : 0.0f; + } else if (bin > binPoints[m + 1] && bin <= binPoints[m + 2]) { + int denom = binPoints[m + 2] - binPoints[m + 1]; + filter.weights[k] = (denom > 0) ? static_cast(binPoints[m + 2] - bin) / denom : 0.0f; + } + } + + melFilters_[m] = filter; + } +} + +int SenseVoiceFeatures::nFrames(int numSamples) const { + if (numSamples < winLength_) return 0; + return (numSamples - winLength_) / hopLength_ + 1; +} + +std::vector SenseVoiceFeatures::extract(const std::vector& samples) const { + if (samples.empty()) return {}; + + int numSamples = static_cast(samples.size()); + + // 1. 预加重 + std::vector emphasized(numSamples); + emphasized[0] = samples[0]; + for (int i = 1; i < numSamples; i++) { + emphasized[i] = samples[i] - preEmphasisCoeff_ * samples[i - 1]; + } + + // 2. 
分帧 + FFT + Mel + 对数压缩 + int numFrames = nFrames(numSamples); + if (numFrames <= 0) return {}; + + auto window = hannWindow(); + int nFreq = nFft_ / 2 + 1; + + std::vector fbankData(numFrames * nMel_); + + for (int f = 0; f < numFrames; f++) { + int frameStart = f * hopLength_; + + // 应用 Hann 窗并 FFT + std::vector fftInput(nFft_, {0.0f, 0.0f}); + for (int i = 0; i < winLength_ && frameStart + i < numSamples; i++) { + fftInput[i] = {emphasized[frameStart + i] * window[i], 0.0f}; + } + fft(fftInput); + + // Mel 滤波器组 + for (int m = 0; m < nMel_; m++) { + const auto& filter = melFilters_[m]; + float energy = 0.0f; + for (int w = 0; w < static_cast(filter.weights.size()); w++) { + int bin = filter.startBin + w; + if (bin < nFreq) { + energy += fftInput[bin].magnitudeSq() * filter.weights[w]; + } + } + // 对数压缩 (使用自然对数) + energy = std::max(energy, 1e-10f); + fbankData[f * nMel_ + m] = std::log(energy); + } + } + + // 3. LFR (Low Frame Rate) 特征拼接 + // 将连续 lfr_window_size 帧 Fbank 特征拼接为一帧 + // 步长为 lfr_window_shift + std::vector lfrFeatures; + int lfrOutputDim = nMel_ * kLFRWindowSize; // 80 * 7 = 560 + + for (int i = 0; ; i += kLFRWindowShift) { + if (i >= numFrames) break; + + // 计算 LFR 窗口 + int leftPad = std::max(0, kLFRWindowSize / 2 - i); + int rightPad = std::max(0, kLFRWindowSize / 2 - (numFrames - 1 - i)); + + std::vector frame(lfrOutputDim, 0.0f); + int outIdx = 0; + + for (int j = -kLFRWindowSize / 2; j < kLFRWindowSize - kLFRWindowSize / 2; j++) { + int idx = i + j; + // 边界填充:复制第一帧或最后一帧 + if (idx < 0) idx = 0; + if (idx >= numFrames) idx = numFrames - 1; + + for (int m = 0; m < nMel_; m++) { + frame[outIdx++] = fbankData[idx * nMel_ + m]; + } + } + + lfrFeatures.insert(lfrFeatures.end(), frame.begin(), frame.end()); + } + + // 4. 
CMVN 归一化 + cmvn(lfrFeatures); + + return lfrFeatures; +} + +void SenseVoiceFeatures::cmvn(std::vector& features) const { + int nLFRFrames = static_cast(features.size()) / kLFROutputDim; + int numValues = static_cast(features.size()); + + for (int i = 0; i < numValues; i++) { + features[i] = (features[i] + kNegMean[i % kLFROutputDim]) * + kInvStddev[i % kLFROutputDim]; + } +} + +} // namespace impress diff --git a/src/core/sense_voice_features.h b/src/core/sense_voice_features.h new file mode 100644 index 0000000..f037e33 --- /dev/null +++ b/src/core/sense_voice_features.h @@ -0,0 +1,54 @@ +#pragma once + +#include + +namespace impress { + +/** + * @brief SenseVoice 音频特征提取器 + * + * 将原始 PCM 音频转换为 SenseVoice 模型所需的 LFR Fbank 特征。 + * 流程: PCM → 预加重 → 分帧 → FFT → Mel 滤波器 → 对数压缩 → + * LFR 拼接 → CMVN 归一化 → 560-dim 特征向量。 + */ +class SenseVoiceFeatures { +public: + /** + * @brief 构造函数 + * @param sampleRate 输入音频采样率(默认 16000) + */ + explicit SenseVoiceFeatures(int sampleRate = 16000); + + /** + * @brief 从 PCM 数据提取 LFR Fbank 特征 + * @param samples 归一化 PCM 浮点数据 [-1, 1] + * @return LFR Fbank 特征,维度 [nFrames * 560] + */ + std::vector extract(const std::vector& samples) const; + + /** @brief 获取特征帧数 */ + int nFrames(int numSamples) const; + +private: + // Fbank 参数 + int sampleRate_; + int nFft_ = 512; + int nMel_ = 80; + int hopLength_ = 160; // 10ms @ 16kHz + int winLength_ = 400; // 25ms @ 16kHz + float preEmphasisCoeff_ = 0.97f; + + // Mel 滤波器组 (预计算) + struct MelFilter { + int startBin; + int endBin; + std::vector weights; + }; + std::vector melFilters_; + + std::vector hannWindow() const; + void buildMelFilters(); + void cmvn(std::vector& features) const; +}; + +} // namespace impress diff --git a/src/core/sense_voice_tokenizer.cpp b/src/core/sense_voice_tokenizer.cpp new file mode 100644 index 0000000..06c0a34 --- /dev/null +++ b/src/core/sense_voice_tokenizer.cpp @@ -0,0 +1,101 @@ +#include "sense_voice_tokenizer.h" +#include "utils/logger.h" +#include +#include +#include + +static 
const char* const kTag = "SenseVoiceTokenizer"; + +namespace impress { + +SenseVoiceTokenizer::SenseVoiceTokenizer() = default; + +bool SenseVoiceTokenizer::load(const QString& tokensPath) { + QFile file(tokensPath); + if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { + LOG_ERROR(kTag, QString("无法打开词表文件: %1").arg(tokensPath)); + return false; + } + + QTextStream stream(&file); + stream.setEncoding(QStringConverter::Utf8); + + tokenToString_.clear(); + + int lineCount = 0; + while (!stream.atEnd()) { + QString line = stream.readLine().trimmed(); + if (line.isEmpty()) continue; + + // 格式: " " — 最后一个是 token_id + int lastSpace = line.lastIndexOf(' '); + if (lastSpace < 0) continue; + + bool ok = false; + int tokenId = line.mid(lastSpace + 1).toInt(&ok); + if (!ok) continue; + + QString tokenStr = line.left(lastSpace); + tokenToString_[tokenId] = tokenStr; + lineCount++; + } + + LOG_INFO(kTag, QString("词表已加载: %1 个词条 (%2)").arg(lineCount).arg(tokensPath)); + return !tokenToString_.empty(); +} + +QString SenseVoiceTokenizer::decode(const std::vector& tokens) const { + if (tokens.empty()) return ""; + + QString result; + for (int token : tokens) { + // 跳过特殊 token + if (token == kTokenBOS || token == kTokenEOS || token == kTokenBlank) { + continue; + } + + auto it = tokenToString_.find(token); + if (it != tokenToString_.end()) { + result += decodeBPE(it->second); + } else { + result += QString("[T%1]").arg(token); + } + } + + return result; +} + +QString SenseVoiceTokenizer::decodeBPE(const QString& token) const { + // SenseVoice 使用 SentencePiece BPE 格式 + // ▁ (U+2581) 表示单词开头/空格 + QString result = token; + + // ▁ → 空格 + result.replace(QChar(0x2581), ' '); + + // 处理 unicode 转义 (如 <0xE5>) + static QRegularExpression hexPattern("<0x([0-9A-Fa-f]+)>"); + QRegularExpressionMatchIterator it = hexPattern.globalMatch(result); + QStringList parts; + int lastPos = 0; + while (it.hasNext()) { + QRegularExpressionMatch match = it.next(); + parts << result.mid(lastPos, 
match.capturedStart() - lastPos); + bool ok; + int code = match.captured(1).toInt(&ok, 16); + if (ok) { + parts << QChar(code); + } else { + parts << match.captured(0); + } + lastPos = match.capturedEnd(); + } + if (!parts.isEmpty() || lastPos > 0) { + parts << result.mid(lastPos); + result = parts.join(""); + } + + return result; +} + +} // namespace impress diff --git a/src/core/sense_voice_tokenizer.h b/src/core/sense_voice_tokenizer.h new file mode 100644 index 0000000..a992fda --- /dev/null +++ b/src/core/sense_voice_tokenizer.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include +#include + +namespace impress { + +/** + * @brief SenseVoice Tokenizer + * + * 加载 tokens.txt 词表,支持 BPE token 到文本的解码。 + * 支持 SenseVoice 的 BPE 词表格式 (SentencePiece)。 + */ +class SenseVoiceTokenizer { +public: + SenseVoiceTokenizer(); + + /** @brief 从 tokens.txt 加载词表 */ + bool load(const QString& tokensPath); + + /** @brief 将 token IDs 解码为文本 */ + QString decode(const std::vector& tokens) const; + + /** @brief 是否已加载 */ + bool isLoaded() const { return !tokenToString_.empty(); } + + /** @brief 词表大小 */ + int vocabSize() const { return static_cast(tokenToString_.size()); } + + // 特殊 token + static constexpr int kTokenBlank = 0; // CTC blank / + static constexpr int kTokenBOS = 1; // + static constexpr int kTokenEOS = 2; // + +private: + std::unordered_map tokenToString_; + QString decodeBPE(const QString& token) const; +}; + +} // namespace impress diff --git a/src/ui/file_transcribe_page.cpp b/src/ui/file_transcribe_page.cpp index e44d2d7..d5f5c51 100644 --- a/src/ui/file_transcribe_page.cpp +++ b/src/ui/file_transcribe_page.cpp @@ -1,5 +1,5 @@ #include "file_transcribe_page.h" -#include "core/stt_engine.h" +#include "core/sense_voice_engine.h" #include "audio/audio_decoder.h" #include "app/config_manager.h" #include "utils/logger.h" @@ -34,7 +34,7 @@ namespace impress { FileTranscribePage::FileTranscribePage(ConfigManager* configManager, QWidget* parent) : QWidget(parent) , 
configManager_(configManager) - , sttEngine_(new STTEngine(this)) + , sttEngine_(new SenseVoiceEngine(this)) , audioDecoder_(new AudioDecoder(this)) { setupUI(); @@ -161,6 +161,7 @@ void FileTranscribePage::onStartTranscribe() { (void)QtConcurrent::run([this, modelPath]() { bool success = sttEngine_->loadModelSync(modelPath, + configManager_->get("stt.tokens_path").toString(), configManager_->get("stt.device").toString(), configManager_->get("stt.num_threads").toInt()); diff --git a/src/ui/file_transcribe_page.h b/src/ui/file_transcribe_page.h index 04caf7b..b9f1bdf 100644 --- a/src/ui/file_transcribe_page.h +++ b/src/ui/file_transcribe_page.h @@ -14,7 +14,7 @@ class QByteArray; namespace impress { class ConfigManager; -class STTEngine; +class SenseVoiceEngine; class AudioDecoder; struct TranscribeTask { @@ -62,7 +62,7 @@ private: QString formatSRTTime(double seconds) const; ConfigManager* configManager_; - STTEngine* sttEngine_; + SenseVoiceEngine* sttEngine_; AudioDecoder* audioDecoder_; // UI 控件 diff --git a/src/ui/settings_page.cpp b/src/ui/settings_page.cpp index 1e0c570..b4983de 100644 --- a/src/ui/settings_page.cpp +++ b/src/ui/settings_page.cpp @@ -47,7 +47,7 @@ void SettingsPage::setupUI() { sttLayout->addRow("模型路径:", modelRow); modelTypeCombo_ = new QComboBox(this); - modelTypeCombo_->addItems({"whisper", "paraformer", "conformer"}); + modelTypeCombo_->addItems({"sense_voice", "whisper", "paraformer", "conformer"}); sttLayout->addRow("模型类型:", modelTypeCombo_); deviceCombo_ = new QComboBox(this); @@ -59,6 +59,15 @@ void SettingsPage::setupUI() { threadSpin_->setValue(4); sttLayout->addRow("推理线程数:", threadSpin_); + auto* tokensRow = new QHBoxLayout(); + tokensPathEdit_ = new QLineEdit(this); + tokensPathEdit_->setPlaceholderText("选择 tokens.txt 文件路径..."); + tokensBrowseBtn_ = new QPushButton("浏览...", this); + connect(tokensBrowseBtn_, &QPushButton::clicked, this, &SettingsPage::onBrowseTokensPath); + tokensRow->addWidget(tokensPathEdit_); + 
tokensRow->addWidget(tokensBrowseBtn_); + sttLayout->addRow("词表路径:", tokensRow); + sampleRateSpin_ = new QSpinBox(this); sampleRateSpin_->setRange(8000, 192000); sampleRateSpin_->setSingleStep(1000); @@ -158,6 +167,7 @@ void SettingsPage::setupUI() { void SettingsPage::loadFromConfig() { modelPathEdit_->setText(configManager_->get("stt.model_path").toString()); + tokensPathEdit_->setText(configManager_->get("stt.tokens_path").toString()); modelTypeCombo_->setCurrentText(configManager_->get("stt.model_type").toString()); deviceCombo_->setCurrentText(configManager_->get("stt.device").toString()); threadSpin_->setValue(configManager_->get("stt.num_threads").toInt()); @@ -179,6 +189,7 @@ void SettingsPage::loadFromConfig() { void SettingsPage::saveToConfig() { configManager_->set("stt.model_path", modelPathEdit_->text()); + configManager_->set("stt.tokens_path", tokensPathEdit_->text()); configManager_->set("stt.model_type", modelTypeCombo_->currentText()); configManager_->set("stt.device", deviceCombo_->currentText()); configManager_->set("stt.num_threads", threadSpin_->value()); @@ -206,6 +217,14 @@ void SettingsPage::onBrowseModelPath() { } } +void SettingsPage::onBrowseTokensPath() { + QString path = QFileDialog::getOpenFileName(this, "选择词表文件", "", + "词表文件 (tokens.txt);;所有文件 (*.*)"); + if (!path.isEmpty()) { + tokensPathEdit_->setText(path); + } +} + void SettingsPage::onSaveConfig() { saveToConfig(); if (configManager_->save()) { diff --git a/src/ui/settings_page.h b/src/ui/settings_page.h index 3bdfa37..c3f7b96 100644 --- a/src/ui/settings_page.h +++ b/src/ui/settings_page.h @@ -29,6 +29,7 @@ public: private slots: void onBrowseModelPath(); + void onBrowseTokensPath(); void onSaveConfig(); void onResetConfig(); @@ -42,6 +43,8 @@ private: // STT 设置 QLineEdit* modelPathEdit_; QPushButton* browseBtn_; + QLineEdit* tokensPathEdit_; + QPushButton* tokensBrowseBtn_; QComboBox* modelTypeCombo_; QComboBox* deviceCombo_; QSpinBox* threadSpin_; diff --git 
a/src/ui/stt_test_page.cpp b/src/ui/stt_test_page.cpp index eda0aef..ef47524 100644 --- a/src/ui/stt_test_page.cpp +++ b/src/ui/stt_test_page.cpp @@ -1,5 +1,5 @@ #include "stt_test_page.h" -#include "core/stt_engine.h" +#include "core/sense_voice_engine.h" #include "audio/audio_capture.h" #include "audio/audio_ring_buffer.h" #include "widgets/audio_waveform.h" @@ -26,7 +26,7 @@ namespace impress { STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent) : QWidget(parent) , configManager_(configManager) - , sttEngine_(new STTEngine(this)) + , sttEngine_(new SenseVoiceEngine(this)) , audioCapture_(new AudioCapture(this)) { setupUI(); @@ -34,11 +34,11 @@ STTTestPage::STTTestPage(ConfigManager* configManager, QWidget* parent) // 信号连接 connect(audioCapture_, &AudioCapture::audioDataReady, this, &STTTestPage::onAudioDataReady); - connect(sttEngine_, &STTEngine::modelLoaded, + connect(sttEngine_, &SenseVoiceEngine::modelLoaded, this, &STTTestPage::onModelLoaded); - connect(sttEngine_, &STTEngine::modelLoadError, + connect(sttEngine_, &SenseVoiceEngine::modelLoadError, this, &STTTestPage::onModelLoadError); - connect(sttEngine_, &STTEngine::modelUnloaded, + connect(sttEngine_, &SenseVoiceEngine::modelUnloaded, this, &STTTestPage::onModelUnloaded); } @@ -133,6 +133,7 @@ void STTTestPage::onToggleRecording() { updateUIState(); sttEngine_->loadModelAsync(modelPath, + configManager_->get("stt.tokens_path").toString(), configManager_->get("stt.device").toString(), configManager_->get("stt.num_threads").toInt()); diff --git a/src/ui/stt_test_page.h b/src/ui/stt_test_page.h index a40da27..bd319fd 100644 --- a/src/ui/stt_test_page.h +++ b/src/ui/stt_test_page.h @@ -13,13 +13,14 @@ class QSpinBox; namespace impress { class ConfigManager; -class STTEngine; +class SenseVoiceEngine; class AudioCapture; /** * @brief STT 测试页面 * * 实时麦克风采集 + 流式识别。 + * 使用 SenseVoice 模型进行推理。 * 模型异步加载,不阻塞 UI。 */ class STTTestPage : public QWidget { @@ -43,7 +44,7 @@ private: void 
processAudioChunk(const std::vector& samples, int sampleRate); ConfigManager* configManager_; - STTEngine* sttEngine_; + SenseVoiceEngine* sttEngine_; AudioCapture* audioCapture_; // UI 控件