fix: 过滤 SenseVoice 特殊标签 <|zh|> <|speech|> <|NEUTRAL|>

CTC 解码输出的 token 包含 SenseVoice 元数据标签（语言、事件、情感），导致识别结果以 <|zh|><|speech|><|NEUTRAL|> 开头。在 tokenizer.decode() 中跳过所有 <|...|> 格式的标签，只保留实际识别文本，并清理首尾空白、合并多余空格。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-12 20:35:40 +08:00 · 2026-05-12 20:35:40 +08:00 · 85b67780b1
commit 85b67780b1
parent 32d3a8e986
1 changed files with 11 additions and 1 deletions
--- a/src/core/sense_voice_tokenizer.cpp
+++ b/src/core/sense_voice_tokenizer.cpp
@@ -56,12 +56,22 @@ QString SenseVoiceTokenizer::decode(const std::vector<int>& tokens) const {

        auto it = tokenToString_.find(token);
        if (it != tokenToString_.end()) {
-            result += decodeBPE(it->second);
+            QString decoded = decodeBPE(it->second);
+            // 过滤 SenseVoice 特殊标签: <|zh|>, <|speech|>, <|NEUTRAL|> 等
+            if (decoded.startsWith("<|") && decoded.endsWith("|>")) {
+                continue;
+            }
+            result += decoded;
        } else {
            result += QString("[T%1]").arg(token);
        }
    }

+    // 清理首尾空白
+    result = result.trimmed();
+    // 将多个连续空格合并为单个空格
+    result.replace(QRegularExpression("\\s+"), " ");
+
    return result;
 }