fix: update PINYIN_TONE_PATTERN and NormalizerZh

commit 7d943b362d
parent d26310d7cf
Author: Yrom
Date: 2025-05-21 13:01:25 +08:00


@@ -62,8 +62,9 @@ class TextNormalizer:
     """
     Matches the pinyin-with-tone format: pinyin + tone digit (1-5), where 5 marks the neutral tone.
     e.g. xuan4, jve2, ying1, zhong4, shang5
+    Does not match beta1, voice2
     """
-    PINYIN_TONE_PATTERN = r"([bmnpqdfghjklzcsxwy]?h?[aeiouüv]{1,2}[ng]*|ng)([1-5])"
+    PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
     """
     Matches the person-name format: Chinese·Chinese or Chinese·Chinese-Chinese
     e.g. 克里斯托弗·诺兰, 约瑟夫·高登-莱维特
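
A quick sanity check of the tightened pattern (plain re, pattern copied verbatim from the diff): the enumerated finals cover the pinyin syllable inventory, and the new (?<![a-z]) lookbehind keeps a trailing digit in an ordinary English word from being read as a tone.

    import re

    PINYIN_TONE_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"

    # Genuine pinyin-plus-tone syllables match from the start of the token.
    for good in ("xuan4", "jve2", "ying1", "zhong4", "shang5"):
        assert re.match(PINYIN_TONE_PATTERN, good, re.IGNORECASE)

    # A digit glued to an ordinary word does not: the (?<![a-z]) lookbehind
    # rejects any match that would begin mid-word.
    for bad in ("beta1", "voice2", "babala2", "hunger2"):
        assert re.search(PINYIN_TONE_PATTERN, bad, re.IGNORECASE) is None
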
@@ -94,8 +95,15 @@ class TextNormalizer:
         else:
             from tn.chinese.normalizer import Normalizer as NormalizerZh
             from tn.english.normalizer import Normalizer as NormalizerEn
-            self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False, overwrite_cache=False)
+            # use a dedicated cache dir for tagger rules built with remove_interjections and remove_erhua disabled
+            cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tagger_cache")
+            if not os.path.exists(cache_dir):
+                os.makedirs(cache_dir)
+            with open(os.path.join(cache_dir, ".gitignore"), "w") as f:
+                f.write("*\n")
+            self.zh_normalizer = NormalizerZh(
+                cache_dir=cache_dir, remove_interjections=False, remove_erhua=False, overwrite_cache=False
+            )
             self.en_normalizer = NormalizerEn(overwrite_cache=False)

     def normalize(self, text: str) -> str:
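
Rationale for the dedicated cache: the tn package compiles its tagger/verbalizer rules into FST files under cache_dir and reuses whatever it finds there, presumably without checking which flags the cached rules were built with, so sharing a cache with a default-flag build could silently discard remove_interjections=False / remove_erhua=False. A minimal sketch of the construction (the kwargs are the ones used in the diff; the path and sample sentence are illustrative):

    import os
    from tn.chinese.normalizer import Normalizer as NormalizerZh

    # A cache dir reserved for this flag combination (illustrative path).
    cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tagger_cache")
    os.makedirs(cache_dir, exist_ok=True)

    zh_normalizer = NormalizerZh(
        cache_dir=cache_dir,
        remove_interjections=False,  # keep interjections such as 哎呀
        remove_erhua=False,          # keep erhua suffixes such as 花儿
        overwrite_cache=False,       # reuse the FSTs once compiled
    )
    print(zh_normalizer.normalize("哎呀,这是2025年5月21日提交的"))
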
@@ -361,7 +369,7 @@ class TextTokenizer:
             sub_sentences = TextTokenizer.split_sentences_by_token(
                 current_sentence, [",", "▁,"], max_tokens_per_sentence=max_tokens_per_sentence
             )
-        elif not ("-" in split_tokens ) and "-" in current_sentence:
+        elif "-" not in split_tokens and "-" in current_sentence:
            # no "," either, so fall back to splitting on "-"
            sub_sentences = TextTokenizer.split_sentences_by_token(
                current_sentence, ["-"], max_tokens_per_sentence=max_tokens_per_sentence
@@ -449,6 +457,10 @@ if __name__ == "__main__":
         "这酒...里...有毒...",
         # edge cases
         "只有,,,才是最好的",
+        "babala2是什么", # babala二是什么?
+        "用beta1测试", # 用beta一测试
+        "have you ever been to beta2?", # have you ever been to beta two?
+        "such as XTTS, CosyVoice2, Fish-Speech, and F5-TTS", # such as xtts,cosyvoice two,fish-speech,and f five-tts
         # person names
         "约瑟夫·高登-莱维特Joseph Gordon-Levitt is an American actor",
         "蒂莫西·唐纳德·库克(英文名:Timothy Donald Cook),通称蒂姆·库克(Tim Cook),美国商业经理、工业工程师和工业开发商,现任苹果公司首席执行官。",
@@ -456,7 +468,6 @@ if __name__ == "__main__":
         "《盗梦空间》是由美国华纳兄弟影片公司出品的电影,由克里斯托弗·诺兰执导并编剧,莱昂纳多·迪卡普里奥、玛丽昂·歌迪亚、约瑟夫·高登-莱维特、艾利奥特·佩吉、汤姆·哈迪等联袂主演,2010年7月16日在美国上映,2010年9月1日在中国内地上映,2020年8月28日在中国内地重映。影片剧情游走于梦境与现实之间,被定义为“发生在意识结构内的当代动作科幻片”,讲述了由莱昂纳多·迪卡普里奥扮演的造梦师,带领特工团队进入他人梦境,从他人的潜意识中盗取机密并重塑他人梦境的故事。",
         "清晨拉开窗帘,阳光洒在窗台的Bloomixy花艺礼盒上——薰衣草香薰蜡烛唤醒嗅觉,永生花束折射出晨露般光泽。设计师将“自然绽放美学”融入每个细节:手工陶瓷花瓶可作首饰收纳,香薰精油含依兰依兰舒缓配方。限量款附赠《365天插花灵感手册》,让每个平凡日子都有花开仪式感。\n宴会厅灯光暗下的刹那,Glimmeria星月系列耳坠开始发光——瑞士冷珐琅工艺让蓝宝石如银河流动,钛合金骨架仅3.2g无负重感。设计师秘密内置微型重力感应器,随步伐产生0.01mm振幅,打造“行走的星光”。七夕限定礼盒含星座定制铭牌,让爱意如星辰永恒闪耀。",
         "电影1:“黑暗骑士”,演员:克里斯蒂安·贝尔、希斯·莱杰,导演:克里斯托弗·诺兰;电影2:“盗梦空间”,演员:莱昂纳多·迪卡普里奥,导演:克里斯托弗·诺兰;电影3:“钢琴家”,演员:艾德里安·布洛迪,导演:罗曼·波兰斯基;电影4:“泰坦尼克号”,演员:莱昂纳多·迪卡普里奥,导演:詹姆斯·卡梅隆;电影5:“阿凡达”,演员:萨姆·沃辛顿,导演:詹姆斯·卡梅隆;电影6:“南方公园大电影”,演员:马特·斯通、托马斯·艾恩格瑞,导演:特雷·帕克",
     ]
     # test the tokenizer
     tokenizer = TextTokenizer(
@@ -474,6 +485,16 @@ if __name__ == "__main__":
     print(f"bos_token: {tokenizer.bos_token}, bos_token_id: {tokenizer.bos_token_id}")
     print(f"eos_token: {tokenizer.eos_token}, eos_token_id: {tokenizer.eos_token_id}")
     print(f"unk_token: {tokenizer.unk_token}, unk_token_id: {tokenizer.unk_token_id}")
+    # test the pinyin tokens (ids 8474-10201)
+    for id in range(8474, 10201):
+        pinyin = tokenizer.convert_ids_to_tokens(id)
+        if re.match(TextNormalizer.PINYIN_TONE_PATTERN, pinyin, re.IGNORECASE) is None:
+            print(f"{pinyin} should be matched")
+    for badcase in [
+        "beta1", "better1", "voice2", "bala2", "babala2", "hunger2"
+    ]:
+        if re.match(TextNormalizer.PINYIN_TONE_PATTERN, badcase, re.IGNORECASE) is not None:
+            print(f"{badcase} should not be matched!")
     # none of these should map to unk_token_id
     for t in set([*TextTokenizer.punctuation_marks_tokens, ",", "▁,", "-", "▁..."]):
         tokens = tokenizer.convert_tokens_to_ids(t)