From 46630ca45bfd161ae64bda0db02cd8b5417a4b57 Mon Sep 17 00:00:00 2001 From: wangyining02 Date: Wed, 26 Mar 2025 19:14:47 +0800 Subject: [PATCH 1/7] =?UTF-8?q?+=E7=AE=80=E5=8D=95=E5=89=8D=E7=AB=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- indextts/infer.py | 18 +++++--- indextts/utils/front.py | 96 +++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 +- 3 files changed, 109 insertions(+), 8 deletions(-) create mode 100644 indextts/utils/front.py diff --git a/indextts/infer.py b/indextts/infer.py index eed25d4..bc62784 100644 --- a/indextts/infer.py +++ b/indextts/infer.py @@ -14,7 +14,7 @@ from indextts.utils.feature_extractors import MelSpectrogramFeatures from indextts.utils.common import tokenize_by_CJK_char from indextts.vqvae.xtts_dvae import DiscreteVAE - +from indextts.utils.front import TextNormalizer class IndexTTS: def __init__(self, cfg_path='checkpoints/config.yaml', model_dir='checkpoints'): self.cfg = OmegaConf.load(cfg_path) @@ -42,16 +42,20 @@ class IndexTTS: self.bigvgan = self.bigvgan.to(self.device) self.bigvgan.eval() print(">> bigvgan weights restored from:", self.bigvgan_path) + self.normalizer = TextNormalizer() + self.normalizer.load() + print(">> TextNormalizer loaded") def preprocess_text(self, text): - chinese_punctuation = ",。!?;:“”‘’()【】《》" - english_punctuation = ",.!?;:\"\"''()[]<>" - - # 创建一个映射字典 - punctuation_map = str.maketrans(chinese_punctuation, english_punctuation) + # chinese_punctuation = ",。!?;:“”‘’()【】《》" + # english_punctuation = ",.!?;:\"\"''()[]<>" + # + # # 创建一个映射字典 + # punctuation_map = str.maketrans(chinese_punctuation, english_punctuation) # 使用translate方法替换标点符号 - return text.translate(punctuation_map) + # return text.translate(punctuation_map) + return self.normalizer.infer(text) def infer(self, audio_prompt, text, output_path): text = self.preprocess_text(text) diff --git a/indextts/utils/front.py b/indextts/utils/front.py new file mode 100644 index 0000000..24ddf03 --- /dev/null +++ b/indextts/utils/front.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +import traceback +import os +import sys +import re +import re + + + + +class TextNormalizer: + def __init__(self): + # self.normalizer = Normalizer(cache_dir="textprocessing/tn") + self.zh_normalizer = None + self.en_normalizer = None + self.char_rep_map = { + ":": ",", + ";": ",", + ";": ",", + ",": ",", + "。": ".", + "!": "!", + "?": "?", + "\n": ".", + "·": ",", + "、": ",", + "...": "…", + "……": "…", + "$": ".", + "“": "'", + "”": "'", + '"': "'", + "‘": "'", + "’": "'", + "(": "'", + ")": "'", + "(": "'", + ")": "'", + "《": "'", + "》": "'", + "【": "'", + "】": "'", + "[": "'", + "]": "'", + "—": "-", + "~": "-", + "~": "-", + "「": "'", + "」": "'", + ":": ",", + } + + def match_email(self, email): + # 正则表达式匹配邮箱格式:数字英文@数字英文.英文 + pattern = r'^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$' + return re.match(pattern, email) is not None + + def use_chinese(self, s): + has_chinese = bool(re.search(r'[\u4e00-\u9fff]', s)) + has_digit = bool(re.search(r'\d', s)) + has_alpha = bool(re.search(r'[a-zA-Z]', s)) + is_email = self.match_email(s) + if has_chinese or not has_alpha or is_email: + return True + else: + return False + + def load(self): + # print(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) + # sys.path.append(model_dir) + + from tn.chinese.normalizer import Normalizer as NormalizerZh + from tn.english.normalizer import Normalizer as NormalizerEn + + self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False) + self.en_normalizer = NormalizerEn() + + def infer(self, text): + pattern = re.compile("|".join(re.escape(p) for p in self.char_rep_map.keys())) + replaced_text = pattern.sub(lambda x: self.char_rep_map[x.group()], text) + if not self.zh_normalizer or not self.en_normalizer: + print("Error, text normalizer is not initialized !!!") + return "" + try: + normalizer = self.zh_normalizer if self.use_chinese(text) else self.en_normalizer + result = normalizer.normalize(text) + except Exception: + result = "" + print(traceback.format_exc()) + return result + + +if __name__ == '__main__': + # 测试程序 + text_normalizer = TextNormalizer() + print(text_normalizer.infer("2.5平方电线")) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 846e188..b813fe9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,5 @@ sentencepiece pypinyin librosa gradio -tqdm \ No newline at end of file +tqdm +WeTextProcessing \ No newline at end of file From 9a925a14974040b695cf30e4e48aae3cf3e6dcc3 Mon Sep 17 00:00:00 2001 From: wangyining02 Date: Wed, 26 Mar 2025 19:28:44 +0800 Subject: [PATCH 2/7] =?UTF-8?q?=E5=89=8D=E7=AB=AF=E5=85=BC=E5=AE=B9arm?= =?UTF-8?q?=E6=9C=BA=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- indextts/utils/front.py | 15 ++++++++++----- requirements.txt | 3 ++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/indextts/utils/front.py b/indextts/utils/front.py index 24ddf03..ec3eb7f 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -68,12 +68,17 @@ class TextNormalizer: def load(self): # print(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) # sys.path.append(model_dir) + import platform + if platform.machine() == "aarch64": + from wetext import Normalizer + self.zh_normalizer = Normalizer(remove_erhua=False,lang="zh",operator="tn") + self.en_normalizer = Normalizer(lang="en",operator="tn") + else: + from tn.chinese.normalizer import Normalizer as NormalizerZh + from tn.english.normalizer import Normalizer as NormalizerEn + self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False) + self.en_normalizer = NormalizerEn() - from tn.chinese.normalizer import Normalizer as NormalizerZh - from tn.english.normalizer import Normalizer as NormalizerEn - - self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False) - self.en_normalizer = NormalizerEn() def infer(self, text): pattern = re.compile("|".join(re.escape(p) for p in self.char_rep_map.keys())) diff --git a/requirements.txt b/requirements.txt index b813fe9..803d193 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ pypinyin librosa gradio tqdm -WeTextProcessing \ No newline at end of file +WeTextProcessing # arm机器如果安装失败,请注释此行 +wetext \ No newline at end of file From fb0bc6a4867db661fead6c53c8618fb63e9e3f41 Mon Sep 17 00:00:00 2001 From: wangyining02 Date: Wed, 26 Mar 2025 19:29:31 +0800 Subject: [PATCH 3/7] fix --- indextts/utils/front.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indextts/utils/front.py b/indextts/utils/front.py index ec3eb7f..a07cb1e 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -88,7 +88,7 @@ class TextNormalizer: return "" try: normalizer = self.zh_normalizer if self.use_chinese(text) else self.en_normalizer - result = normalizer.normalize(text) + result = normalizer.normalize(replaced_text) except Exception: result = "" print(traceback.format_exc()) From f6e7b4acf6639e0aa245d723090b8b1f21e9f545 Mon Sep 17 00:00:00 2001 From: wangyining02 Date: Wed, 26 Mar 2025 19:33:12 +0800 Subject: [PATCH 4/7] fix --- indextts/utils/front.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indextts/utils/front.py b/indextts/utils/front.py index a07cb1e..5e703e9 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -87,7 +87,7 @@ class TextNormalizer: print("Error, text normalizer is not initialized !!!") return "" try: - normalizer = self.zh_normalizer if self.use_chinese(text) else self.en_normalizer + normalizer = self.zh_normalizer if self.use_chinese(replaced_text) else self.en_normalizer result = normalizer.normalize(replaced_text) except Exception: result = "" From 1004452e958d3b771355b5c097720a10eb62846b Mon Sep 17 00:00:00 2001 From: wangyining02 Date: Wed, 26 Mar 2025 20:29:12 +0800 Subject: [PATCH 5/7] =?UTF-8?q?WeTextProcessing:=20overwrite=5Fcache=3DTru?= =?UTF-8?q?e=20=E5=88=B7=E6=96=B0=E5=89=8D=E7=AB=AF=E7=BC=93=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- indextts/infer.py | 3 +++ indextts/utils/front.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/indextts/infer.py b/indextts/infer.py index bc62784..6cb432d 100644 --- a/indextts/infer.py +++ b/indextts/infer.py @@ -58,7 +58,10 @@ class IndexTTS: return self.normalizer.infer(text) def infer(self, audio_prompt, text, output_path): + print(f"origin text:{text}") text = self.preprocess_text(text) + print(f"normalized text:{text}") + audio, sr = torchaudio.load(audio_prompt) audio = torch.mean(audio, dim=0, keepdim=True) diff --git a/indextts/utils/front.py b/indextts/utils/front.py index 5e703e9..4276e08 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -76,8 +76,8 @@ class TextNormalizer: else: from tn.chinese.normalizer import Normalizer as NormalizerZh from tn.english.normalizer import Normalizer as NormalizerEn - self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False) - self.en_normalizer = NormalizerEn() + self.zh_normalizer = NormalizerZh(remove_interjections=False, remove_erhua=False,overwrite_cache=True) + self.en_normalizer = NormalizerEn(overwrite_cache=True) def infer(self, text): From fd81f4a5bd80150b6cceafa40e68563b22e07629 Mon Sep 17 00:00:00 2001 From: kemuriririn <10inspiral@gmail.com> Date: Thu, 27 Mar 2025 14:03:51 +0800 Subject: [PATCH 6/7] =?UTF-8?q?=E6=81=A2=E5=A4=8D=E8=BE=93=E5=85=A5?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=E6=8B=BC=E9=9F=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- indextts/utils/front.py | 80 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/indextts/utils/front.py b/indextts/utils/front.py index 4276e08..b2ad70a 100644 --- a/indextts/utils/front.py +++ b/indextts/utils/front.py @@ -1,11 +1,6 @@ # -*- coding: utf-8 -*- import traceback -import os -import sys import re -import re - - class TextNormalizer: @@ -69,7 +64,7 @@ class TextNormalizer: # print(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) # sys.path.append(model_dir) import platform - if platform.machine() == "aarch64": + if platform.system() == "Darwin": from wetext import Normalizer self.zh_normalizer = Normalizer(remove_erhua=False,lang="zh",operator="tn") self.en_normalizer = Normalizer(lang="en",operator="tn") @@ -92,8 +87,81 @@ class TextNormalizer: except Exception: result = "" print(traceback.format_exc()) + result = self.restore_pinyin_tone_numbers(replaced_text, result) return result + def pinyin_match(self, pinyin): + pattern = r"(qun)(\d)" + repl = r"qvn\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(quan)(\d)" + repl = r"qvan\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(que)(\d)" + repl = r"qve\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(qu)(\d)" + repl = r"qv\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(ju)(\d)" + repl = r"jv\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(jue)(\d)" + repl = r"jve\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xun)(\d)" + repl = r"xvn\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xue)(\d)" + repl = r"xve\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xu)(\d)" + repl = r"xv\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(juan)(\d)" + repl = r"jvan\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(jun)(\d)" + repl = r"jvn\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + + pattern = r"(xuan)(\d)" + repl = r"xvan\g<2>" + pinyin = re.sub(pattern, repl, pinyin) + return pinyin + + def restore_pinyin_tone_numbers(self,original_text, processed_text): + # 第一步:恢复拼音后的音调数字(1-4) + # 建立中文数字到阿拉伯数字的映射 + chinese_to_num = {'一': '1', '二': '2', '三': '3', '四': '4'} + + # 使用正则表达式找到拼音+中文数字的组合(如 "xuan四") + def replace_tone(match): + pinyin = match.group(1) # 拼音部分 + chinese_num = match.group(2) # 中文数字部分 + # 将中文数字转换为阿拉伯数字 + num = chinese_to_num.get(chinese_num, chinese_num) + return f"{pinyin}{num}" + + # 匹配拼音后跟中文数字(一、二、三、四)的情况 + pattern = r'([a-zA-Z]+)([一二三四])' + restored_text = re.sub(pattern, replace_tone, processed_text) + restored_text = restored_text.lower() + restored_text = self.pinyin_match(restored_text) + + return restored_text + + if __name__ == '__main__': # 测试程序 From 6286b0ffc966ac7b774cfdffffe165ffa5352c7c Mon Sep 17 00:00:00 2001 From: kemuriririn <10inspiral@gmail.com> Date: Wed, 2 Apr 2025 17:40:41 +0800 Subject: [PATCH 7/7] =?UTF-8?q?=E6=8E=A8=E7=90=86=E6=97=B6=E5=8A=A0?= =?UTF-8?q?=E8=BD=BDbpe=20model=E4=BD=BF=E7=94=A8=E7=9B=B8=E5=AF=B9?= =?UTF-8?q?=E4=BA=8E=E6=A8=A1=E5=9E=8B=E6=A0=B9=E7=9B=AE=E5=BD=95=E7=9A=84?= =?UTF-8?q?=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 - checkpoints/config.yaml | 112 ++++++++++++++++++++++++++++++++++++++++ indextts/infer.py | 2 +- 3 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 checkpoints/config.yaml diff --git a/README.md b/README.md index ad7737c..2d5eea3 100644 --- a/README.md +++ b/README.md @@ -105,11 +105,9 @@ apt-get install ffmpeg ``` 3. Download models: ```bash -mkdir checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/bigvgan_discriminator.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/bigvgan_generator.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/bpe.model -P checkpoints -wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/config.yaml -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/dvae.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/gpt.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/unigram_12000.vocab -P checkpoints diff --git a/checkpoints/config.yaml b/checkpoints/config.yaml new file mode 100644 index 0000000..e24336d --- /dev/null +++ b/checkpoints/config.yaml @@ -0,0 +1,112 @@ +dataset: + bpe_model: bpe.model + sample_rate: 24000 + squeeze: false + mel: + sample_rate: 24000 + n_fft: 1024 + hop_length: 256 + win_length: 1024 + n_mels: 100 + mel_fmin: 0 + normalize: false + +gpt: + model_dim: 1024 + max_mel_tokens: 605 + max_text_tokens: 402 + heads: 16 + use_mel_codes_as_input: true + mel_length_compression: 1024 + layers: 20 + number_text_tokens: 12000 + number_mel_codes: 8194 + start_mel_token: 8192 + stop_mel_token: 8193 + start_text_token: 0 + stop_text_token: 1 + train_solo_embeddings: false + condition_type: "conformer_perceiver" + condition_module: + output_size: 512 + linear_units: 2048 + attention_heads: 8 + num_blocks: 6 + input_layer: "conv2d2" + perceiver_mult: 2 + +vqvae: + channels: 100 + num_tokens: 8192 + hidden_dim: 512 + num_resnet_blocks: 3 + codebook_dim: 512 + num_layers: 2 + positional_dims: 1 + kernel_size: 3 + smooth_l1_loss: true + use_transposed_convs: false + +bigvgan: + adam_b1: 0.8 + adam_b2: 0.99 + lr_decay: 0.999998 + seed: 1234 + + resblock: "1" + upsample_rates: [4,4,4,4,2,2] + upsample_kernel_sizes: [8,8,4,4,4,4] + upsample_initial_channel: 1536 + resblock_kernel_sizes: [3,7,11] + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + feat_upsample: false + speaker_embedding_dim: 512 + cond_d_vector_in_each_upsampling_layer: true + + gpt_dim: 1024 + + activation: "snakebeta" + snake_logscale: true + + use_cqtd_instead_of_mrd: true + cqtd_filters: 128 + cqtd_max_filters: 1024 + cqtd_filters_scale: 1 + cqtd_dilations: [1, 2, 4] + cqtd_hop_lengths: [512, 256, 256] + cqtd_n_octaves: [9, 9, 9] + cqtd_bins_per_octaves: [24, 36, 48] + + resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]] + mpd_reshapes: [2, 3, 5, 7, 11] + use_spectral_norm: false + discriminator_channel_mult: 1 + + use_multiscale_melloss: true + lambda_melloss: 15 + + clip_grad_norm: 1000 + + segment_size: 16384 + num_mels: 100 + num_freq: 1025 + n_fft: 1024 + hop_size: 256 + win_size: 1024 + + sampling_rate: 24000 + + fmin: 0 + fmax: null + fmax_for_loss: null + mel_type: "pytorch" + + num_workers: 2 + dist_config: + dist_backend: "nccl" + dist_url: "tcp://localhost:54321" + world_size: 1 + +dvae_checkpoint: dvae.pth +gpt_checkpoint: gpt.pth +bigvgan_checkpoint: bigvgan_generator.pth \ No newline at end of file diff --git a/indextts/infer.py b/indextts/infer.py index 6cb432d..f1f419e 100644 --- a/indextts/infer.py +++ b/indextts/infer.py @@ -74,7 +74,7 @@ class IndexTTS: auto_conditioning = cond_mel tokenizer = spm.SentencePieceProcessor() - tokenizer.load(self.cfg.dataset['bpe_model']) + tokenizer.load(os.path.join(self.model_dir,self.cfg.dataset['bpe_model'])) punctuation = ["!", "?", ".", ";", "!", "?", "。", ";"] pattern = r"(?<=[{0}])\s*".format("".join(punctuation))