---
# Configuration: dataset, gpt, vqvae and bigvgan settings, plus checkpoint filenames.
# Dataset / audio front-end settings.
# NOTE(review): indentation was lost in the captured copy of this file and has
# been reconstructed. `sample_rate` occurs twice in this section, so the second
# occurrence must belong to `mel` (otherwise it would be a duplicate key);
# the keys that follow it are assumed to be `mel` children too — confirm.
dataset:
  bpe_model: bpe.model
  sample_rate: 24000
  squeeze: false
  mel:
    sample_rate: 24000
    n_fft: 1024
    hop_length: 256
    win_length: 1024
    n_mels: 100
    mel_fmin: 0
    normalize: false

# GPT model settings.
# NOTE(review): indentation was lost in the captured copy of this file and has
# been reconstructed; `condition_module` had no inline value, so the keys that
# follow it (up to the blank line) are treated as its children — confirm.
gpt:
  model_dim: 1024
  max_mel_tokens: 605
  max_text_tokens: 402
  heads: 16
  use_mel_codes_as_input: true
  mel_length_compression: 1024
  layers: 20
  activation_function: "gelu_pytorch_tanh"
  number_text_tokens: 12000
  # 8194 = 8192 + 2; presumably the 8192 code entries plus the start/stop
  # tokens defined just below — verify against the model code.
  number_mel_codes: 8194
  start_mel_token: 8192
  stop_mel_token: 8193
  start_text_token: 0
  stop_text_token: 1
  train_solo_embeddings: false
  condition_type: "conformer_perceiver"
  condition_module:
    output_size: 512
    linear_units: 2048
    attention_heads: 8
    num_blocks: 6
    input_layer: "conv2d2"
    perceiver_mult: 2

# VQ-VAE settings.
# NOTE(review): indentation was lost in the captured copy of this file; all
# keys up to the blank line are treated as children of `vqvae` — confirm.
vqvae:
  channels: 100
  num_tokens: 8192
  hidden_dim: 512
  num_resnet_blocks: 3
  codebook_dim: 512
  num_layers: 2
  positional_dims: 1
  kernel_size: 3
  smooth_l1_loss: true
  use_transposed_convs: false

# BigVGAN settings (optimizer, generator, discriminator, loss, audio, runtime).
# NOTE(review): indentation was lost in the captured copy of this file and has
# been reconstructed. Every key from `adam_b1` through `dist_config` is treated
# as a child of `bigvgan`; `dist_config` had no inline value, so the three keys
# after it are treated as its children — confirm against the consumer.
bigvgan:
  adam_b1: 0.8
  adam_b2: 0.99
  lr_decay: 0.999998
  seed: 1234

  # Kept as a quoted string: unquoted it would load as the integer 1.
  resblock: "1"
  # The product of these rates is 1024.
  upsample_rates: [4, 4, 4, 4, 2, 2]
  upsample_kernel_sizes: [8, 8, 4, 4, 4, 4]
  upsample_initial_channel: 1536
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  feat_upsample: false
  speaker_embedding_dim: 512
  cond_d_vector_in_each_upsampling_layer: true

  gpt_dim: 1024

  activation: "snakebeta"
  snake_logscale: true

  use_cqtd_instead_of_mrd: true
  cqtd_filters: 128
  cqtd_max_filters: 1024
  cqtd_filters_scale: 1
  cqtd_dilations: [1, 2, 4]
  cqtd_hop_lengths: [512, 256, 256]
  cqtd_n_octaves: [9, 9, 9]
  cqtd_bins_per_octaves: [24, 36, 48]

  resolutions: [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]]
  mpd_reshapes: [2, 3, 5, 7, 11]
  use_spectral_norm: false
  discriminator_channel_mult: 1

  use_multiscale_melloss: true
  lambda_melloss: 15

  clip_grad_norm: 1000

  segment_size: 16384
  num_mels: 100
  num_freq: 1025
  n_fft: 1024
  hop_size: 256
  win_size: 1024

  sampling_rate: 24000

  fmin: 0
  fmax: null
  fmax_for_loss: null
  mel_type: "pytorch"

  num_workers: 2
  dist_config:
    dist_backend: "nccl"
    dist_url: "tcp://localhost:54321"
    world_size: 1

# Checkpoint filenames.
# NOTE(review): placed at the top level because they follow a blank line after
# the preceding section; indentation was lost in the captured copy — confirm.
# The stray trailing " |" on the last entry was removed so it is not read as
# part of the filename.
dvae_checkpoint: dvae.pth
gpt_checkpoint: gpt.pth
bigvgan_checkpoint: bigvgan_generator.pth