diff --git a/indextts/utils/front.py b/indextts/utils/front.py index b629e9a..2571035 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -62,8 +62,9 @@ class TextNormalizer: """ 匹配拼音声调格式:pinyin+数字,声调1-5,5表示轻声 例如:xuan4, jve2, ying1, zhong4, shang5 + 不匹配:beta1, voice2 """ - PINYIN_TONE_PATTERN = r"([bmnpqdfghjklzcsxwy]?h?[aeiouüv]{1,2}[ng]*|ng)([1-5])" + PINYIN_TONE_PATTERN = r"(? str: @@ -361,7 +369,7 @@ class TextTokenizer: sub_sentences = TextTokenizer.split_sentences_by_token( current_sentence, [",", "▁,"], max_tokens_per_sentence=max_tokens_per_sentence ) - elif not ("-" in split_tokens ) and "-" in current_sentence: + elif "-" not in split_tokens and "-" in current_sentence: # 没有,,则按-分割 sub_sentences = TextTokenizer.split_sentences_by_token( current_sentence, ["-"], max_tokens_per_sentence=max_tokens_per_sentence @@ -449,6 +457,10 @@ if __name__ == "__main__": "这酒...里...有毒...", # 异常case "只有,,,才是最好的", + "babala2是什么?", # babala二是什么? + "用beta1测试", # 用beta一测试 + "have you ever been to beta2?", # have you ever been to beta two? + "such as XTTS, CosyVoice2, Fish-Speech, and F5-TTS", # such as xtts,cosyvoice two,fish-speech,and f five-tts # 人名 "约瑟夫·高登-莱维特(Joseph Gordon-Levitt is an American actor)", "蒂莫西·唐纳德·库克(英文名:Timothy Donald Cook),通称蒂姆·库克(Tim Cook),美国商业经理、工业工程师和工业开发商,现任苹果公司首席执行官。", @@ -456,7 +468,6 @@ if __name__ == "__main__": "《盗梦空间》是由美国华纳兄弟影片公司出品的电影,由克里斯托弗·诺兰执导并编剧,莱昂纳多·迪卡普里奥、玛丽昂·歌迪亚、约瑟夫·高登-莱维特、艾利奥特·佩吉、汤姆·哈迪等联袂主演,2010年7月16日在美国上映,2010年9月1日在中国内地上映,2020年8月28日在中国内地重映。影片剧情游走于梦境与现实之间,被定义为“发生在意识结构内的当代动作科幻片”,讲述了由莱昂纳多·迪卡普里奥扮演的造梦师,带领特工团队进入他人梦境,从他人的潜意识中盗取机密,并重塑他人梦境的故事。", "清晨拉开窗帘,阳光洒在窗台的Bloomixy花艺礼盒上——薰衣草香薰蜡烛唤醒嗅觉,永生花束折射出晨露般光泽。设计师将“自然绽放美学”融入每个细节:手工陶瓷花瓶可作首饰收纳,香薰精油含依兰依兰舒缓配方。限量款附赠《365天插花灵感手册》,让每个平凡日子都有花开仪式感。\n宴会厅灯光暗下的刹那,Glimmeria星月系列耳坠开始发光——瑞士冷珐琅工艺让蓝宝石如银河流动,钛合金骨架仅3.2g无负重感。设计师秘密:内置微型重力感应器,随步伐产生0.01mm振幅,打造“行走的星光”。七夕限定礼盒含星座定制铭牌,让爱意如星辰永恒闪耀。", "电影1:“黑暗骑士”(演员:克里斯蒂安·贝尔、希斯·莱杰;导演:克里斯托弗·诺兰);电影2:“盗梦空间”(演员:莱昂纳多·迪卡普里奥;导演:克里斯托弗·诺兰);电影3:“钢琴家”(演员:艾德里安·布洛迪;导演:罗曼·波兰斯基);电影4:“泰坦尼克号”(演员:莱昂纳多·迪卡普里奥;导演:詹姆斯·卡梅隆);电影5:“阿凡达”(演员:萨姆·沃辛顿;导演:詹姆斯·卡梅隆);电影6:“南方公园:大电影”(演员:马特·斯通、托马斯·艾恩格瑞;导演:特雷·帕克)", - ] # 测试分词器 tokenizer = TextTokenizer( @@ -474,6 +485,16 @@ if __name__ == "__main__": print(f"bos_token: {tokenizer.bos_token}, bos_token_id: {tokenizer.bos_token_id}") print(f"eos_token: {tokenizer.eos_token}, eos_token_id: {tokenizer.eos_token_id}") print(f"unk_token: {tokenizer.unk_token}, unk_token_id: {tokenizer.unk_token_id}") + # 测试拼音 (8474-10201) + for id in range(8474, 10201): + pinyin = tokenizer.convert_ids_to_tokens(id) + if re.match(TextNormalizer.PINYIN_TONE_PATTERN, pinyin, re.IGNORECASE) is None: + print(f"{pinyin} should be matched") + for badcase in [ + "beta1", "better1", "voice2", "bala2", "babala2", "hunger2" + ]: + if re.match(TextNormalizer.PINYIN_TONE_PATTERN, badcase, re.IGNORECASE) is not None: + print(f"{badcase} should not be matched!") # 不应该有 unk_token_id for t in set([*TextTokenizer.punctuation_marks_tokens, ",", "▁,", "-", "▁..."]): tokens = tokenizer.convert_tokens_to_ids(t)