From 85b67780b1b39aa089f134a91fd9a0e0f9206b94 Mon Sep 17 00:00:00 2001
From: impressionyang <impressionyang@outlook.com>
Date: Tue, 12 May 2026 20:35:40 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=BF=87=E6=BB=A4=20SenseVoice=20?=
 =?UTF-8?q?=E7=89=B9=E6=AE=8A=E6=A0=87=E7=AD=BE=20<|zh|>=20<|speech|>=20<|?=
 =?UTF-8?q?NEUTRAL|>?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CTC 解码输出的 token 包含 SenseVoice 元数据标签（语言、事件、情感），
导致识别结果以 <|zh|><|speech|><|NEUTRAL|> 开头。在 tokenizer.decode()
中跳过所有 <|...|> 格式的标签，只保留实际识别文本，并清理首尾空白，
同时将连续的空白字符（空格、制表符、换行等）合并为单个空格。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/core/sense_voice_tokenizer.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/src/core/sense_voice_tokenizer.cpp b/src/core/sense_voice_tokenizer.cpp
index 06c0a34..454282c 100644
--- a/src/core/sense_voice_tokenizer.cpp
+++ b/src/core/sense_voice_tokenizer.cpp
@@ -56,12 +56,22 @@ QString SenseVoiceTokenizer::decode(const std::vector<int>& tokens) const {
 
         auto it = tokenToString_.find(token);
         if (it != tokenToString_.end()) {
-            result += decodeBPE(it->second);
+            QString decoded = decodeBPE(it->second);
+            // 过滤 SenseVoice 特殊标签: <|zh|>, <|speech|>, <|NEUTRAL|> 等
+            if (decoded.startsWith("<|") && decoded.endsWith("|>")) {
+                continue;
+            }
+            result += decoded;
         } else {
             result += QString("[T%1]").arg(token);
         }
     }
 
+    // 清理首尾空白
+    result = result.trimmed();
+    // 将多个连续空格合并为单个空格
+    result.replace(QRegularExpression("\\s+"), " ");
+
     return result;
 }