fix 分句逻辑,增加测试用例

不包含「。!?」的句子没有被正确分句
This commit is contained in:
yrom 2025-05-18 22:45:42 +08:00
parent c0c17fe387
commit 3553a3755d
2 changed files with 49 additions and 40 deletions

View File

@@ -68,7 +68,7 @@ class TextNormalizer:
匹配人名格式中文·中文中文·中文-中文
例如克里斯托弗·诺兰约瑟夫·高登-莱维特
"""
NAME_PATTERN = r"[\u4e00-\u9fff]+([-·—][\u4e00-\u9fff]+){1,2}"
NAME_PATTERN = r"[\u4e00-\u9fff]+(?:[-·—][\u4e00-\u9fff]+){1,2}"
def use_chinese(self, s):
has_chinese = bool(re.search(r"[\u4e00-\u9fff]", s))
@@ -334,55 +334,57 @@ class TextTokenizer:
"""
将tokenize后的结果按特定token进一步分割
"""
# 处理特殊情况
if len(tokenized_str) == 0:
return []
sentences: List[List[str]] = []
current_sentence = []
current_sentence_tokens_len = 0
for i in range(len(tokenized_str)):
token = tokenized_str[i]
current_sentence.append(token)
if token in split_tokens:
if len(current_sentence) == 1:
# 如果当前tokens只有一个且是切分符号则忽略这条句子
pass
elif len(current_sentence) == 2 and current_sentence[0] == '':
# 如果当前tokens只有两个且仅有切分符号则忽略这条句子
pass
elif len(current_sentence) <= max_tokens_per_sentence:
current_sentence_tokens_len += 1
if current_sentence_tokens_len <= max_tokens_per_sentence:
if token in split_tokens and current_sentence_tokens_len > 2:
if i < len(tokenized_str) - 1:
if tokenized_str[i + 1] in ["'", "'"]:
# 后续token是',则不切分
current_sentence.append(tokenized_str[i + 1])
i += 1
sentences.append(current_sentence)
else:
# 如果当前tokens的长度超过最大限制
if not ("," in split_tokens or "▁," in split_tokens ) and ("," in current_sentence or "▁," in current_sentence):
# 如果当前tokens中有,,则按,分割
sub_sentences = TextTokenizer.split_sentences_by_token(
current_sentence, [",", "▁,"], max_tokens_per_sentence=max_tokens_per_sentence
)
elif not ("-" in split_tokens ) and "-" in current_sentence:
# 没有,,则按-分割
sub_sentences = TextTokenizer.split_sentences_by_token(
current_sentence, ["-"], max_tokens_per_sentence=max_tokens_per_sentence
)
current_sentence = []
current_sentence_tokens_len = 0
continue
# 如果当前tokens的长度超过最大限制
if not ("," in split_tokens or "▁," in split_tokens ) and ("," in current_sentence or "▁," in current_sentence):
# 如果当前tokens中有,,则按,分割
sub_sentences = TextTokenizer.split_sentences_by_token(
current_sentence, [",", "▁,"], max_tokens_per_sentence=max_tokens_per_sentence
)
elif not ("-" in split_tokens ) and "-" in current_sentence:
# 没有,,则按-分割
sub_sentences = TextTokenizer.split_sentences_by_token(
current_sentence, ["-"], max_tokens_per_sentence=max_tokens_per_sentence
)
else:
# 按照长度分割
sub_sentences = []
for j in range(0, len(current_sentence), max_tokens_per_sentence):
if j + max_tokens_per_sentence < len(current_sentence):
sub_sentences.append(current_sentence[j : j + max_tokens_per_sentence])
else:
# 按照长度分割
sub_sentences = []
for j in range(0, len(current_sentence), max_tokens_per_sentence):
if j + max_tokens_per_sentence < len(current_sentence):
sub_sentences.append(current_sentence[j : j + max_tokens_per_sentence])
else:
sub_sentences.append(current_sentence[j:])
warnings.warn(
f"The tokens length of sentence exceeds limit: {max_tokens_per_sentence}, "
f"Tokens in sentence: {current_sentence}."
"Maybe unexpected behavior",
RuntimeWarning,
)
sentences.extend(sub_sentences)
current_sentence = []
if len(current_sentence) > 0:
sub_sentences.append(current_sentence[j:])
warnings.warn(
f"The tokens length of sentence exceeds limit: {max_tokens_per_sentence}, "
f"Tokens in sentence: {current_sentence}."
"Maybe unexpected behavior",
RuntimeWarning,
)
sentences.extend(sub_sentences)
current_sentence = []
current_sentence_tokens_len = 0
if current_sentence_tokens_len > 0:
assert current_sentence_tokens_len <= max_tokens_per_sentence
sentences.append(current_sentence)
# 如果相邻的句子加起来长度小于最大限制,则合并
merged_sentences = []
@@ -442,6 +444,7 @@ if __name__ == "__main__":
"Couting down 3, 2, 1, go!",
"数到3就开始1、2、3",
"This sales for 2.5% off, only $12.5.",
"5G网络是4G网络的升级版2G网络是3G网络的前身",
"苹果于2030/1/2发布新 iPhone 2X 系列手机,最低售价仅 ¥12999",
"这酒...里...有毒...",
# 异常case
@@ -452,6 +455,8 @@ if __name__ == "__main__":
# 长句子
"《盗梦空间》是由美国华纳兄弟影片公司出品的电影,由克里斯托弗·诺兰执导并编剧,莱昂纳多·迪卡普里奥、玛丽昂·歌迪亚、约瑟夫·高登-莱维特、艾利奥特·佩吉、汤姆·哈迪等联袂主演2010年7月16日在美国上映2010年9月1日在中国内地上映2020年8月28日在中国内地重映。影片剧情游走于梦境与现实之间被定义为“发生在意识结构内的当代动作科幻片”讲述了由莱昂纳多·迪卡普里奥扮演的造梦师带领特工团队进入他人梦境从他人的潜意识中盗取机密并重塑他人梦境的故事。",
"清晨拉开窗帘阳光洒在窗台的Bloomixy花艺礼盒上——薰衣草香薰蜡烛唤醒嗅觉永生花束折射出晨露般光泽。设计师将“自然绽放美学”融入每个细节手工陶瓷花瓶可作首饰收纳香薰精油含依兰依兰舒缓配方。限量款附赠《365天插花灵感手册》让每个平凡日子都有花开仪式感。\n宴会厅灯光暗下的刹那Glimmeria星月系列耳坠开始发光——瑞士冷珐琅工艺让蓝宝石如银河流动钛合金骨架仅3.2g无负重感。设计师秘密内置微型重力感应器随步伐产生0.01mm振幅,打造“行走的星光”。七夕限定礼盒含星座定制铭牌,让爱意如星辰永恒闪耀。",
"电影1“黑暗骑士”演员克里斯蒂安·贝尔、希斯·莱杰导演克里斯托弗·诺兰电影2“盗梦空间”演员莱昂纳多·迪卡普里奥导演克里斯托弗·诺兰电影3“钢琴家”演员艾德里安·布洛迪导演罗曼·波兰斯基电影4“泰坦尼克号”演员莱昂纳多·迪卡普里奥导演詹姆斯·卡梅隆电影5“阿凡达”演员萨姆·沃辛顿导演詹姆斯·卡梅隆电影6“南方公园大电影”演员马特·斯通、托马斯·艾恩格瑞导演特雷·帕克",
]
# 测试分词器
tokenizer = TextTokenizer(
@@ -479,16 +484,19 @@ if __name__ == "__main__":
# 测试 normalize后的字符能被分词器识别
print(f"`{ch}`", "->", tokenizer.sp_model.Encode(ch, out_type=str))
print(f"` {ch}`", "->", tokenizer.sp_model.Encode(f" {ch}", out_type=str))
max_tokens_per_sentence=120
for i in range(len(cases)):
print(f"原始文本: {cases[i]}")
print(f"Normalized: {text_normalizer.normalize(cases[i])}")
tokens = tokenizer.tokenize(cases[i])
print(f"Tokenzied: {tokens}")
sentences = tokenizer.split_sentences(tokens, max_tokens_per_sentence=100)
print("Tokenzied: ", ", ".join([f"`{t}`" for t in tokens]))
sentences = tokenizer.split_sentences(tokens, max_tokens_per_sentence=max_tokens_per_sentence)
print("Splitted sentences count:", len(sentences))
if len(sentences) > 1:
for j in range(len(sentences)):
print(f" {j}, count:", len(sentences[j]), ", tokens:", "".join(sentences[j]))
if len(sentences[j]) > max_tokens_per_sentence:
print(f"Warning: sentence {j} is too long, length: {len(sentences[j])}")
#print(f"Token IDs (first 10): {codes[i][:10]}")
if tokenizer.unk_token in codes[i]:
print(f"Warning: `{cases[i]}` contains UNKNOWN token")

View File

@@ -2,6 +2,7 @@
{"prompt_audio":"sample_prompt.wav","text":"大家好我现在正在bilibili 体验 ai 科技说实话来之前我绝对想不到AI技术已经发展到这样匪夷所思的地步了","infer_mode":0}
{"prompt_audio":"sample_prompt.wav","text":"晕XUAN4是一种GAN3觉","infer_mode":0}
{"prompt_audio":"sample_prompt.wav","text":"最zhong4要的是不要chong2蹈覆辙","infer_mode":0}
{"prompt_audio":"sample_prompt.wav","text":"ni3 dao4 di3 xing2 bu5 xing2 a5gei3 wo3 chong2 zuo4 yi5 bian4","infer_mode":0}
{"prompt_audio":"sample_prompt.wav","text":"Matt Hougan, chief investment officer at Bitwise, predicts Bitcoin (BTC) will reach $200,000 by the end of 2025 due to a supply shock from heightened institutional demand. In an interview with Cointelegraph at Consensus 2025 in Toronto, the executive said that Bitwise's Bitcoin price prediction model is driven exclusively by supply and demand metrics. \"I think eventually that will exhaust sellers at the $100,000 level where we have been stuck, and I think the next stopping point above that is $200,000,\" the executive said.","infer_mode":1}
{"prompt_audio":"sample_prompt.wav","text":"《盗梦空间》英语Inception是由美国华纳兄弟影片公司出品的电影由克里斯托弗·诺兰Christopher Edward Nolan执导并编剧莱昂纳多·迪卡普里奥Leonardo Wilhelm DiCaprio、玛丽昂·歌迪亚、约瑟夫·高登-莱维特、艾利奥特·佩吉、汤姆·哈迪等联袂主演2010年7月16日在美国上映2010年9月1日在中国内地上映2020年8月28日在中国内地重映。豆瓣评分9.4IMDB 8.8。影片剧情游走于梦境与现实之间,被定义为“发生在意识结构内的当代动作科幻片”,讲述了由 Leonardo 扮演的造梦师,带领特工团队进入他人梦境,从他人的潜意识中盗取机密,并重塑他人梦境的故事。","infer_mode":1}
{"prompt_audio":"sample_prompt.wav","text":"清晨拉开窗帘阳光洒在窗台的Bloomixy花艺礼盒上——薰衣草香薰蜡烛唤醒嗅觉永生花束折射出晨露般光泽。设计师将“自然绽放美学”融入每个细节手工陶瓷花瓶可作首饰收纳香薰精油含依兰依兰舒缓配方。限量款附赠《365天插花灵感手册》让每个平凡日子都有花开仪式感。宴会厅灯光暗下的刹那Glimmeria星月系列耳坠开始发光——瑞士冷珐琅工艺让蓝宝石如银河流动钛合金骨架仅3.2g无负重感。设计师秘密内置微型重力感应器随步伐产生0.01mm振幅,打造“行走的星光”。七夕限定礼盒含星座定制铭牌,让爱意如星辰永恒闪耀。","infer_mode":1}