diff --git a/indextts/infer_v2.py b/indextts/infer_v2.py
index afc285a..b39090b 100644
--- a/indextts/infer_v2.py
+++ b/indextts/infer_v2.py
@@ -305,6 +305,22 @@ class IndexTTS2:
             print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
             audio = audio[:, :max_audio_samples]
         return audio, sr
+
+    def normalize_emo_vec(self, emo_vector, apply_bias=True):
+        # apply biased emotion factors for better user experience,
+        # by de-emphasizing emotions that can cause strange results
+        if apply_bias:
+            # [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm]
+            emo_bias = [0.9375, 0.875, 1.0, 1.0, 0.9375, 0.9375, 0.6875, 0.5625]
+            emo_vector = [vec * bias for vec, bias in zip(emo_vector, emo_bias)]
+
+        # the total emotion sum must be 0.8 or less
+        emo_sum = sum(emo_vector)
+        if emo_sum > 0.8:
+            scale_factor = 0.8 / emo_sum
+            emo_vector = [vec * scale_factor for vec in emo_vector]
+
+        return emo_vector
 
     # 原始推理模式 (original inference mode)
     def infer(self, spk_audio_prompt, text, output_path,
diff --git a/webui.py b/webui.py
index 302a5b3..0a97f75 100644
--- a/webui.py
+++ b/webui.py
@@ -6,8 +6,6 @@
 import time
 import warnings
 
-import numpy as np
-
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=UserWarning)
 
@@ -104,14 +102,6 @@ with open("examples/cases.jsonl", "r", encoding="utf-8") as f:
                       example.get("emo_text") is not None]
 )
 
-def normalize_emo_vec(emo_vec):
-    # emotion factors for better user experience
-    k_vec = [0.75,0.70,0.80,0.80,0.75,0.75,0.55,0.45]
-    tmp = np.array(k_vec) * np.array(emo_vec)
-    if np.sum(tmp) > 0.8:
-        tmp = tmp * 0.8/ np.sum(tmp)
-    return tmp.tolist()
-
 def gen_single(emo_control_method,prompt, text,
                emo_ref_path, emo_weight,
                vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
@@ -145,7 +135,7 @@ def gen_single(emo_control_method,prompt, text,
         pass
     if emo_control_method == 2: # emotion from custom vectors
         vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
-        vec = normalize_emo_vec(vec)
+        vec = tts.normalize_emo_vec(vec, apply_bias=True)
     else: # don't use the emotion vector inputs for the other modes
         vec = None
 
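
For reference, a minimal standalone sketch of the normalization logic this diff moves into `IndexTTS2.normalize_emo_vec`, handy for checking the numbers outside the TTS class. The sample input vector is hypothetical and only illustrates the bias-then-cap behaviour.

```python
# Standalone sketch mirroring IndexTTS2.normalize_emo_vec from this diff.
# Emotion order: [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm]
EMO_BIAS = [0.9375, 0.875, 1.0, 1.0, 0.9375, 0.9375, 0.6875, 0.5625]

def normalize_emo_vec(emo_vector, apply_bias=True):
    if apply_bias:
        # de-emphasize emotions that tend to produce strange results
        emo_vector = [vec * bias for vec, bias in zip(emo_vector, EMO_BIAS)]
    emo_sum = sum(emo_vector)
    if emo_sum > 0.8:
        # rescale so the total emotion weight never exceeds 0.8
        emo_vector = [vec * 0.8 / emo_sum for vec in emo_vector]
    return emo_vector

if __name__ == "__main__":
    # Hypothetical input: 1.0 "happy" + 0.5 "surprised".
    # Bias gives [0.9375, ..., 0.34375, ...] (sum 1.28125), which is then
    # scaled down so the final sum is exactly 0.8.
    print(normalize_emo_vec([1.0, 0, 0, 0, 0, 0, 0.5, 0]))
```

The cap at 0.8 and the per-emotion bias factors come directly from the added method; only the example call is invented here.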