From 8aa8064a53c5b5b53ff20f6de94aaadc18a4cd9d Mon Sep 17 00:00:00 2001 From: Arcitec <38923130+Arcitec@users.noreply.github.com> Date: Sun, 14 Sep 2025 00:22:41 +0200 Subject: [PATCH] feat: Add reusable Emotion Vector normalization helper - The WebUI was secretly squashing all emotion vectors and re-scaling them. It's a good idea for user friendliness, but it makes it harder to learn what values will work in Python when using the WebUI for testing. - Instead, let's move the normalization code into IndexTTS2 as a helper function which is used by Gradio and can be used from other people's code too. - The emotion bias (which reduces the influence of certain emotions) has also been converted into an optional feature, which can be turned off if such biasing isn't wanted. And all biasing values have been re-scaled to use 1.0 as the reference, to avoid scaling relative to 0.8 (which previously meant that it applied double scaling). --- indextts/infer_v2.py | 16 ++++++++++++++++ webui.py | 12 +----------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/indextts/infer_v2.py b/indextts/infer_v2.py index afc285a..b39090b 100644 --- a/indextts/infer_v2.py +++ b/indextts/infer_v2.py @@ -305,6 +305,22 @@ class IndexTTS2: print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples") audio = audio[:, :max_audio_samples] return audio, sr + + def normalize_emo_vec(self, emo_vector, apply_bias=True): + # apply biased emotion factors for better user experience, + # by de-emphasizing emotions that can cause strange results + if apply_bias: + # [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm] + emo_bias = [0.9375, 0.875, 1.0, 1.0, 0.9375, 0.9375, 0.6875, 0.5625] + emo_vector = [vec * bias for vec, bias in zip(emo_vector, emo_bias)] + + # the total emotion sum must be 0.8 or less + emo_sum = sum(emo_vector) + if emo_sum > 0.8: + scale_factor = 0.8 / emo_sum + emo_vector = [vec * scale_factor for vec in emo_vector] + + return emo_vector # 原始推理模式 def infer(self, spk_audio_prompt, text, output_path, diff --git a/webui.py b/webui.py index 302a5b3..0a97f75 100644 --- a/webui.py +++ b/webui.py @@ -6,8 +6,6 @@ import time import warnings -import numpy as np - warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) @@ -104,14 +102,6 @@ with open("examples/cases.jsonl", "r", encoding="utf-8") as f: example.get("emo_text") is not None] ) -def normalize_emo_vec(emo_vec): - # emotion factors for better user experience - k_vec = [0.75,0.70,0.80,0.80,0.75,0.75,0.55,0.45] - tmp = np.array(k_vec) * np.array(emo_vec) - if np.sum(tmp) > 0.8: - tmp = tmp * 0.8/ np.sum(tmp) - return tmp.tolist() - def gen_single(emo_control_method,prompt, text, emo_ref_path, emo_weight, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, @@ -145,7 +135,7 @@ def gen_single(emo_control_method,prompt, text, pass if emo_control_method == 2: # emotion from custom vectors vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8] - vec = normalize_emo_vec(vec) + vec = tts.normalize_emo_vec(vec, apply_bias=True) else: # don't use the emotion vector inputs for the other modes vec = None