fix: 过滤 SenseVoice 特殊标签 <|zh|> <|speech|> <|NEUTRAL|>
CTC 解码输出的 token 包含 SenseVoice 元数据标签(语言、事件、情感),导致识别结果以 <|zh|><|speech|><|NEUTRAL|> 开头。在 tokenizer.decode() 中跳过所有 <|...|> 格式的标签,只保留实际识别文本,并清理首尾空白和合并多余空格。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
32d3a8e986
commit
85b67780b1
@@ -56,12 +56,22 @@ QString SenseVoiceTokenizer::decode(const std::vector<int>& tokens) const {
|
||||
|
||||
auto it = tokenToString_.find(token);
|
||||
if (it != tokenToString_.end()) {
|
||||
result += decodeBPE(it->second);
|
||||
QString decoded = decodeBPE(it->second);
|
||||
// 过滤 SenseVoice 特殊标签: <|zh|>, <|speech|>, <|NEUTRAL|> 等
|
||||
if (decoded.startsWith("<|") && decoded.endsWith("|>")) {
|
||||
continue;
|
||||
}
|
||||
result += decoded;
|
||||
} else {
|
||||
result += QString("[T%1]").arg(token);
|
||||
}
|
||||
}
|
||||
|
||||
// 清理首尾空白
|
||||
result = result.trimmed();
|
||||
// 将多个连续空格合并为单个空格
|
||||
result.replace(QRegularExpression("\\s+"), " ");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user