fix: 过滤 SenseVoice 特殊标签 <|zh|> <|speech|> <|NEUTRAL|>
CTC 解码输出的 token 包含 SenseVoice 元数据标签(语言、事件、情感),导致识别结果以 <|zh|><|speech|><|NEUTRAL|> 开头。在 tokenizer.decode() 中跳过所有 <|...|> 格式的标签,只保留实际识别文本,并清理首尾空白和合并多余空格。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
32d3a8e986
commit
85b67780b1
@ -56,12 +56,22 @@ QString SenseVoiceTokenizer::decode(const std::vector<int>& tokens) const {
|
|||||||
|
|
||||||
auto it = tokenToString_.find(token);
|
auto it = tokenToString_.find(token);
|
||||||
if (it != tokenToString_.end()) {
|
if (it != tokenToString_.end()) {
|
||||||
result += decodeBPE(it->second);
|
QString decoded = decodeBPE(it->second);
|
||||||
|
// 过滤 SenseVoice 特殊标签: <|zh|>, <|speech|>, <|NEUTRAL|> 等
|
||||||
|
if (decoded.startsWith("<|") && decoded.endsWith("|>")) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
result += decoded;
|
||||||
} else {
|
} else {
|
||||||
result += QString("[T%1]").arg(token);
|
result += QString("[T%1]").arg(token);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 清理首尾空白
|
||||||
|
result = result.trimmed();
|
||||||
|
// 将多个连续空格合并为单个空格
|
||||||
|
result.replace(QRegularExpression("\\s+"), " ");
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user