feat(webui): Implement emotion weighting for vectors and text modes

- This is a major new feature, which now allows for much more natural speech generation by lowering the influence of the emotion vector/text control modes.
- It is particularly useful for the "emotion text description" control mode, where a strength of 0.6 or lower produces much more natural speech.
2025-09-11 04:19:58 +02:00 · 2025-09-11 04:19:58 +02:00 · d899770313
commit d899770313
parent 9668064377
1 changed files with 28 additions and 21 deletions
--- a/webui.py
+++ b/webui.py
@ -126,25 +126,26 @@ def gen_single(emo_control_method,prompt, text,
    }
    if type(emo_control_method) is not int:
        emo_control_method = emo_control_method.value
-    if emo_control_method == 0:
-        emo_ref_path = None
+    if emo_control_method == 0:  # emotion from speaker
+        emo_ref_path = None  # remove external reference audio
        emo_weight = 1.0
-    if emo_control_method == 1:
-        emo_weight = emo_weight
-    if emo_control_method == 2:
+    if emo_control_method == 1:  # emotion from reference audio
+        # emo_weight = emo_weight
+        pass
+    if emo_control_method == 2:  # emotion from custom vectors
        vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
-        vec_sum = sum([vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8])
-        if vec_sum > 1.5:
+        if sum(vec) > 1.5:
            gr.Warning(i18n("情感向量之和不能超过1.5，请调整后重试。"))
            return
    else:
+        # don't use the emotion vector inputs for the other modes
        vec = None

    if emo_text == "":
        # erase empty emotion descriptions; `infer()` will then automatically use the main prompt
        emo_text = None

-    print(f"Emo control mode:{emo_control_method},vec:{vec}")
+    print(f"Emo control mode:{emo_control_method},weight:{emo_weight},vec:{vec}")
    output = tts.infer(spk_audio_prompt=prompt, text=text,
                       output_path=output_path,
                       emo_audio_prompt=emo_ref_path, emo_alpha=emo_weight,
@ -167,6 +168,7 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
 <a href='https://arxiv.org/abs/2506.21619'><img src='https://img.shields.io/badge/ArXiv-2506.21619-red'></a>
 </p>
    ''')
+
    with gr.Tab(i18n("音频生成")):
        with gr.Row():
            os.makedirs("prompts",exist_ok=True)
@ -192,9 +194,6 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
            with gr.Row():
                emo_upload = gr.Audio(label=i18n("上传情感参考音频"), type="filepath")

-            with gr.Row():
-                emo_weight = gr.Slider(label=i18n("情感权重"), minimum=0.0, maximum=1.6, value=0.8, step=0.01)
-
        # 情感随机采样
        with gr.Row():
            emo_random = gr.Checkbox(label=i18n("情感随机采样"),value=False,visible=False)
@ -217,6 +216,9 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
            with gr.Row():
                emo_text = gr.Textbox(label=i18n("情感描述文本"), placeholder=i18n("请输入情绪描述（或留空以自动使用目标文本作为情绪描述）"), value="", info=i18n("例如：高兴，愤怒，悲伤等"))

+        with gr.Row(visible=False) as emo_weight_group:
+            emo_weight = gr.Slider(label=i18n("情感权重"), minimum=0.0, maximum=1.6, value=0.8, step=0.01)
+
        with gr.Accordion(i18n("高级生成参数设置"), open=False):
            with gr.Row():
                with gr.Column(scale=1):
@ -287,26 +289,30 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
                segments_preview: gr.update(value=df),
            }
    def on_method_select(emo_control_method):
-        if emo_control_method == 1:
+        if emo_control_method == 1:  # emotion reference audio
            return (gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False),
-                    gr.update(visible=False)
+                    gr.update(visible=False),
+                    gr.update(visible=True)
                    )
-        elif emo_control_method == 2:
+        elif emo_control_method == 2:  # emotion vectors
            return (gr.update(visible=False),
                    gr.update(visible=True),
-                    gr.update(visible=True),
-                    gr.update(visible=False)
-                    )
-        elif emo_control_method == 3:
-            return (gr.update(visible=False),
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=True)
                    )
-        else:
+        elif emo_control_method == 3:  # emotion text description
            return (gr.update(visible=False),
+                    gr.update(visible=True),
+                    gr.update(visible=False),
+                    gr.update(visible=True),
+                    gr.update(visible=True)
+                    )
+        else:  # 0: same as speaker voice
+            return (gr.update(visible=False),
+                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False)
@ -317,7 +323,8 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
        outputs=[emotion_reference_group,
                 emo_random,
                 emotion_vector_group,
-                 emo_text_group]
+                 emo_text_group,
+                 emo_weight_group]
    )

    input_text_single.change(