From 1734698f3d9431a816172b57bb342ee4df404610 Mon Sep 17 00:00:00 2001 From: kemuriririn Date: Thu, 27 Mar 2025 14:12:12 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=B8=80=E4=BA=9B=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E6=96=87=E6=9C=AC=E5=A4=84=E7=90=86=E7=9A=84fix=20(#18)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * +简单前端 * 前端兼容arm机器 * fix * fix * WeTextProcessing: overwrite_cache=True 刷新前端缓存 * 恢复输入中的拼音 --------- Co-authored-by: kemuriririn <10inspiral@gmail.com> --- indextts/infer.py | 3 ++ indextts/utils/front.py | 85 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/indextts/infer.py b/indextts/infer.py index bc62784..6cb432d 100644 --- a/indextts/infer.py +++ b/indextts/infer.py @@ -58,7 +58,10 @@ class IndexTTS: return self.normalizer.infer(text) def infer(self, audio_prompt, text, output_path): + print(f"origin text:{text}") text = self.preprocess_text(text) + print(f"normalized text:{text}") + audio, sr = torchaudio.load(audio_prompt) audio = torch.mean(audio, dim=0, keepdim=True) diff --git a/indextts/utils/front.py b/indextts/utils/front.py index 5e703e9..cf5a7e9 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- import traceback -import os -import sys import re -import re - - - class TextNormalizer: def __init__(self): @@ -69,16 +63,15 @@ class TextNormalizer: # print(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) # sys.path.append(model_dir) import platform - if platform.machine() == "aarch64": + if platform.system() == "Darwin": from wetext import Normalizer self.zh_normalizer = Normalizer(remove_erhua=False,lang="zh",operator="tn") self.en_normalizer = Normalizer(lang="en",operator="tn") else: from tn.chinese.normalizer import Normalizer as NormalizerZh from tn.english.normalizer import Normalizer as NormalizerEn - self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False) - self.en_normalizer = NormalizerEn() - + self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False,overwrite_cache=True) + self.en_normalizer = NormalizerEn(overwrite_cache=True) def infer(self, text): pattern = re.compile("|".join(re.escape(p) for p in self.char_rep_map.keys())) @@ -92,8 +85,80 @@ class TextNormalizer: except Exception: result = "" print(traceback.format_exc()) + result = self.restore_pinyin_tone_numbers(replaced_text, result) return result + def pinyin_match(self, pinyin): + pattern = r"(qun)(\d)" + repl = r"qvn\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(quan)(\d)" + repl = r"qvan\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(que)(\d)" + repl = r"qve\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(qu)(\d)" + repl = r"qv\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(ju)(\d)" + repl = r"jv\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(jue)(\d)" + repl = r"jve\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xun)(\d)" + repl = r"xvn\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xue)(\d)" + repl = r"xve\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xu)(\d)" + repl = r"xv\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(juan)(\d)" + repl = r"jvan\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(jun)(\d)" + repl = r"jvn\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xuan)(\d)" + repl = r"xvan\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + return pinyin + + def restore_pinyin_tone_numbers(self,original_text, processed_text): + # 第一步:恢复拼音后的音调数字(1-4) + # 建立中文数字到阿拉伯数字的映射 + chinese_to_num = {'一': '1', '二': '2', '三': '3', '四': '4'} + + # 使用正则表达式找到拼音+中文数字的组合(如 "xuan四") + def replace_tone(match): + pinyin = match.group(1) # 拼音部分 + chinese_num = match.group(2) # 中文数字部分 + # 将中文数字转换为阿拉伯数字 + num = chinese_to_num.get(chinese_num, chinese_num) + return f"{pinyin}{num}" + + # 匹配拼音后跟中文数字(一、二、三、四)的情况 + pattern = r'([a-zA-Z]+)([一二三四])' + restored_text = re.sub(pattern, replace_tone, processed_text) + restored_text = restored_text.lower() + restored_text = self.pinyin_match(restored_text) + + return restored_text + if __name__ == '__main__': # 测试程序 From c9bea55903f4b10e0fe5ca1ec01d12b4772dde99 Mon Sep 17 00:00:00 2001 From: index-tts Date: Fri, 28 Mar 2025 15:56:47 +0800 Subject: [PATCH 2/2] Update README.md --- README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index b9202dd..ad7737c 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,20 @@ ## 👉🏻 IndexTTS 👈🏻 -[[HuggingFace Demo]](https://huggingface.co/spaces/IndexTeam/IndexTTS) [[Paper]](https://arxiv.org/abs/2502.05512) [[Demos]](https://index-tts.github.io) +[[HuggingFace Demo]](https://huggingface.co/spaces/IndexTeam/IndexTTS) [[ModelScope Demo]](https://modelscope.cn/studios/IndexTeam/IndexTTS-Demo) \ +[[Paper]](https://arxiv.org/abs/2502.05512) [[Demos]](https://index-tts.github.io) **IndexTTS** is a GPT-style text-to-speech (TTS) model mainly based on XTTS and Tortoise. It is capable of correcting the pronunciation of Chinese characters using pinyin and controlling pauses at any position through punctuation marks. We enhanced multiple modules of the system, including the improvement of speaker condition feature representation, and the integration of BigVGAN2 to optimize audio quality. Trained on tens of thousands of hours of data, our system achieves state-of-the-art performance, outperforming current popular TTS systems such as XTTS, CosyVoice2, Fish-Speech, and F5-TTS. Experience **IndexTTS**: Please contact xuanwu@bilibili.com for more detailed information. +### Contact +QQ群:553460296 \ +Discord:https://discord.gg/uT32E7KDmy \ +欢迎大家来交流讨论! +## 📣 Updates +- `2025/03/25` 🔥🔥 We release the model parameters and inference code. +- `2025/02/12` 🔥 We submitted our paper on arXiv, and released our demos and test sets. ## 🖥️ Method @@ -32,15 +40,12 @@ The main improvements and contributions are summarized as follows: - **IndexTTS** incorporate a conformer conditioning encoder and a BigVGAN2-based speechcode decoder. This improves training stability, voice timbre similarity, and sound quality. - We release all test sets here, including those for polysyllabic words, subjective and objective test sets. -## 📣 Updates -- `2025/03/25` 🔥🔥 We release the model parameters and inference code. -- `2025/02/12` 🔥 We submitted our paper on arXiv, and released our demos and test sets. ## Model Download -| **HuggingFace** | -|----------------------------------------------------------| -| [😁IndexTTS](https://huggingface.co/IndexTeam/Index-TTS) | +| **HuggingFace** | **ModelScope** | +|----------------------------------------------------------|----------------------------------------------------------| +| [😁IndexTTS](https://huggingface.co/IndexTeam/Index-TTS) | [IndexTTS](https://modelscope.cn/models/IndexTeam/Index-TTS) | ## 📑 Evaluation