diff --git a/README.md b/README.md index ad7737c..2d5eea3 100644 --- a/README.md +++ b/README.md @@ -105,11 +105,9 @@ apt-get install ffmpeg ``` 3. Download models: ```bash -mkdir checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/bigvgan_discriminator.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/bigvgan_generator.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/bpe.model -P checkpoints -wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/config.yaml -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/dvae.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/gpt.pth -P checkpoints wget https://huggingface.co/IndexTeam/Index-TTS/resolve/main/unigram_12000.vocab -P checkpoints diff --git a/checkpoints/config.yaml b/checkpoints/config.yaml new file mode 100644 index 0000000..e24336d --- /dev/null +++ b/checkpoints/config.yaml @@ -0,0 +1,112 @@ +dataset: + bpe_model: bpe.model + sample_rate: 24000 + squeeze: false + mel: + sample_rate: 24000 + n_fft: 1024 + hop_length: 256 + win_length: 1024 + n_mels: 100 + mel_fmin: 0 + normalize: false + +gpt: + model_dim: 1024 + max_mel_tokens: 605 + max_text_tokens: 402 + heads: 16 + use_mel_codes_as_input: true + mel_length_compression: 1024 + layers: 20 + number_text_tokens: 12000 + number_mel_codes: 8194 + start_mel_token: 8192 + stop_mel_token: 8193 + start_text_token: 0 + stop_text_token: 1 + train_solo_embeddings: false + condition_type: "conformer_perceiver" + condition_module: + output_size: 512 + linear_units: 2048 + attention_heads: 8 + num_blocks: 6 + input_layer: "conv2d2" + perceiver_mult: 2 + +vqvae: + channels: 100 + num_tokens: 8192 + hidden_dim: 512 + num_resnet_blocks: 3 + codebook_dim: 512 + num_layers: 2 + positional_dims: 1 + kernel_size: 3 + smooth_l1_loss: true + use_transposed_convs: false + +bigvgan: + adam_b1: 0.8 + adam_b2: 0.99 + lr_decay: 0.999998 + seed: 1234 + + resblock: "1" + upsample_rates: [4,4,4,4,2,2] + upsample_kernel_sizes: [8,8,4,4,4,4] + upsample_initial_channel: 1536 + resblock_kernel_sizes: [3,7,11] + resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] + feat_upsample: false + speaker_embedding_dim: 512 + cond_d_vector_in_each_upsampling_layer: true + + gpt_dim: 1024 + + activation: "snakebeta" + snake_logscale: true + + use_cqtd_instead_of_mrd: true + cqtd_filters: 128 + cqtd_max_filters: 1024 + cqtd_filters_scale: 1 + cqtd_dilations: [1, 2, 4] + cqtd_hop_lengths: [512, 256, 256] + cqtd_n_octaves: [9, 9, 9] + cqtd_bins_per_octaves: [24, 36, 48] + + resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]] + mpd_reshapes: [2, 3, 5, 7, 11] + use_spectral_norm: false + discriminator_channel_mult: 1 + + use_multiscale_melloss: true + lambda_melloss: 15 + + clip_grad_norm: 1000 + + segment_size: 16384 + num_mels: 100 + num_freq: 1025 + n_fft: 1024 + hop_size: 256 + win_size: 1024 + + sampling_rate: 24000 + + fmin: 0 + fmax: null + fmax_for_loss: null + mel_type: "pytorch" + + num_workers: 2 + dist_config: + dist_backend: "nccl" + dist_url: "tcp://localhost:54321" + world_size: 1 + +dvae_checkpoint: dvae.pth +gpt_checkpoint: gpt.pth +bigvgan_checkpoint: bigvgan_generator.pth \ No newline at end of file diff --git a/indextts/infer.py b/indextts/infer.py index 6cb432d..f1f419e 100644 --- a/indextts/infer.py +++ b/indextts/infer.py @@ -74,7 +74,7 @@ class IndexTTS: auto_conditioning = cond_mel tokenizer = spm.SentencePieceProcessor() - tokenizer.load(self.cfg.dataset['bpe_model']) + tokenizer.load(os.path.join(self.model_dir,self.cfg.dataset['bpe_model'])) punctuation = ["!", "?", ".", ";", "!", "?", "。", ";"] pattern = r"(?<=[{0}])\s*".format("".join(punctuation))