From 85b67780b1b39aa089f134a91fd9a0e0f9206b94 Mon Sep 17 00:00:00 2001 From: impressionyang Date: Tue, 12 May 2026 20:35:40 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E8=BF=87=E6=BB=A4=20SenseVoice=20?= =?UTF-8?q?=E7=89=B9=E6=AE=8A=E6=A0=87=E7=AD=BE=20<|zh|>=20<|speech|>=20<|?= =?UTF-8?q?NEUTRAL|>?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CTC 解码输出的 token 包含 SenseVoice 元数据标签(语言、事件、情感), 导致识别结果以 <|zh|><|speech|><|NEUTRAL|> 开头。在 tokenizer.decode() 中跳过所有 <|...|> 格式的标签,只保留实际识别文本,并清理首尾空白 和合并多余空格。 Co-Authored-By: Claude Opus 4.6 --- src/core/sense_voice_tokenizer.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/core/sense_voice_tokenizer.cpp b/src/core/sense_voice_tokenizer.cpp index 06c0a34..454282c 100644 --- a/src/core/sense_voice_tokenizer.cpp +++ b/src/core/sense_voice_tokenizer.cpp @@ -56,12 +56,22 @@ QString SenseVoiceTokenizer::decode(const std::vector& tokens) const { auto it = tokenToString_.find(token); if (it != tokenToString_.end()) { - result += decodeBPE(it->second); + QString decoded = decodeBPE(it->second); + // 过滤 SenseVoice 特殊标签: <|zh|>, <|speech|>, <|NEUTRAL|> 等 + if (decoded.startsWith("<|") && decoded.endsWith("|>")) { + continue; + } + result += decoded; } else { result += QString("[T%1]").arg(token); } } + // 清理首尾空白 + result = result.trimmed(); + // 将多个连续空格合并为单个空格 + result.replace(QRegularExpression("\\s+"), " "); + return result; }