diff --git a/src/core/sense_voice_tokenizer.cpp b/src/core/sense_voice_tokenizer.cpp index 06c0a34..454282c 100644 --- a/src/core/sense_voice_tokenizer.cpp +++ b/src/core/sense_voice_tokenizer.cpp @@ -56,12 +56,22 @@ QString SenseVoiceTokenizer::decode(const std::vector& tokens) const { auto it = tokenToString_.find(token); if (it != tokenToString_.end()) { - result += decodeBPE(it->second); + QString decoded = decodeBPE(it->second); + // 过滤 SenseVoice 特殊标签: <|zh|>, <|speech|>, <|NEUTRAL|> 等 + if (decoded.startsWith("<|") && decoded.endsWith("|>")) { + continue; + } + result += decoded; } else { result += QString("[T%1]").arg(token); } } + // 清理首尾空白 + result = result.trimmed(); + // 将多个连续空格合并为单个空格 + result.replace(QRegularExpression("\\s+"), " "); + return result; }