fix: Fast and robust text-to-emotion algorithm

- The new algorithm is now very fast and uses less memory, since it doesn't chain multiple `.replace()` calls or create a bunch of temporary strings and temporary dictionaries and lists anymore. - Parses the JSON output from the QwenEmotion model directly instead of trying to manually parse it. If JSON parsing fails, it falls back to a fast and highly-accurate RegEx search which finds all key-value pairs. - The desired emotion vector order is now stored as a static class attribute instead of being created from scratch on every call. - The emotion dictionary creation has been completely rewritten to use a clear algorithm which takes the QwenEmotion answers, builds a new dictionary using `self.desired_vector_order`, maps each key's name to their English translations, fetches the values from QwenEmotion's answers or 0.0 if no value was given by QE, and clamps the values to the min/max ranges. - The `backup_dict` is now removed, since it was error-prone and fragile. It could grow out of sync with the code if not carefully maintained to keep the correct order and labels. - To handle the "fallback" dictionary creation, we now automatically scan the final emotion vectors, and if none of them are above 0.0 (meaning we didn't detect any emotions in the input text), we give the final vectors a "calm: 1.0" value. This means that we never have to worry about the fallback dictionary's correctness. - The previous algorithm had multiple bugs. This rewrite fixes a serious vector order bug: The old algorithm built the dictionary via the found keys, and only checked if there's 8 keys in QwenEmotion's response, but it didn't check that the keys were valid. When building the final emotion dict, it skipped any values if they were not found in QE's response. Meaning that if the QE response only contained 4 of the 8 expected emotion vector labels, those would all be added at the start of the new dictionary as the "first 4 dict slots". After that, it looped through the "backup_dict" and appended any missing values at the end. This resulted in a final emotion dictionary with the wrong order for the emotion vectors. The new code always produces the correct emotion vector order. - Discovered another bug in the text-to-emotion handling for the "melancholic" emotion, which has never worked for Chinese or English at all. It will be fixed in an upcoming patch.
2025-09-08 14:00:23 +02:00 · 2025-09-08 14:00:23 +02:00 · 58ad225fb4
commit 58ad225fb4
parent feba501013
1 changed files with 40 additions and 43 deletions
--- a/indextts/infer_v2.py
+++ b/indextts/infer_v2.py
@ -2,6 +2,8 @@ import os
 from subprocess import CalledProcessError

 os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
+import json
+import re
 import time
 import librosa
 import torch
@ -315,8 +317,9 @@ class IndexTTS2:
            # assert emo_alpha == 1.0
            if emo_text is None:
                emo_text = text
-            emo_dict, content = self.qwen_emo.inference(emo_text)
+            emo_dict = self.qwen_emo.inference(emo_text)
            print(emo_dict)
+            # convert ordered dict to list of vectors; the order is VERY important!
            emo_vector = list(emo_dict.values())

        if emo_vector is not None:
@ -602,59 +605,41 @@ class QwenEmotion:
            device_map="auto"
        )
        self.prompt = "文本情感分类"
-        self.convert_dict = {
+        self.cn_key_to_en = {
            "高兴": "happy",
            "愤怒": "angry",
            "悲伤": "sad",
            "恐惧": "afraid",
            "反感": "disgusted",
+            # TODO: the "低落" (melancholic) emotion will always be mapped to
+            # "悲伤" (sad) by QwenEmotion's text analysis. it doesn't know the
+            # difference between those emotions even if user writes exact words.
            "低落": "melancholic",
            "惊讶": "surprised",
            "自然": "calm",
        }
-        self.backup_dict = {"happy": 0, "angry": 0, "sad": 0, "afraid": 0, "disgusted": 0, "melancholic": 0,
-                            "surprised": 0, "calm": 1.0}
+        self.desired_vector_order = ["高兴", "愤怒", "悲伤", "恐惧", "反感", "低落", "惊讶", "自然"]
        self.max_score = 1.2
        self.min_score = 0.0

+    def clamp_score(self, value):
+        return max(self.min_score, min(self.max_score, value))
+
    def convert(self, content):
-        content = content.replace("\n", " ")
-        content = content.replace(" ", "")
-        content = content.replace("{", "")
-        content = content.replace("}", "")
-        content = content.replace('"', "")
-        parts = content.strip().split(',')
-        print(parts)
-        parts_dict = {}
-        desired_order = ["高兴", "愤怒", "悲伤", "恐惧", "反感", "低落", "惊讶", "自然"]
-        for part in parts:
-            key_value = part.strip().split(':')
-            if len(key_value) == 2:
-                parts_dict[key_value[0].strip()] = part
-        # 按照期望顺序重新排列
-        ordered_parts = [parts_dict[key] for key in desired_order if key in parts_dict]
-        parts = ordered_parts
-        if len(parts) != len(self.convert_dict):
-            return self.backup_dict
+        # generate emotion vector dictionary:
+        # - insert values in desired order (Python 3.7+ `dict` remembers insertion order)
+        # - convert Chinese keys to English
+        # - clamp all values to the allowed min/max range
+        # - use 0.0 for any values that were missing in `content`
+        emotion_dict = {
+            self.cn_key_to_en[cn_key]: self.clamp_score(content.get(cn_key, 0.0))
+            for cn_key in self.desired_vector_order
+        }

-        emotion_dict = {}
-        for part in parts:
-            key_value = part.strip().split(':')
-            if len(key_value) == 2:
-                try:
-                    key = self.convert_dict[key_value[0].strip()]
-                    value = float(key_value[1].strip())
-                    value = max(self.min_score, min(self.max_score, value))
-                    emotion_dict[key] = value
-                except Exception:
-                    continue
-
-        for key in self.backup_dict:
-            if key not in emotion_dict:
-                emotion_dict[key] = 0.0
-
-        if sum(emotion_dict.values()) <= 0:
-            return self.backup_dict
+        # default to a calm/neutral voice if all emotion vectors were empty
+        if all(val <= 0.0 for val in emotion_dict.values()):
+            print(">> no emotions detected; using default calm/neutral voice")
+            emotion_dict["calm"] = 1.0

        return emotion_dict

@ -687,9 +672,21 @@ class QwenEmotion:
        except ValueError:
            index = 0

-        content = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
-        emotion_dict = self.convert(content)
-        return emotion_dict, content
+        content = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True)
+
+        # decode the JSON emotion detections as a dictionary
+        try:
+            content = json.loads(content)
+        except json.decoder.JSONDecodeError:
+            # invalid JSON; fallback to manual string parsing
+            # print(">> parsing QwenEmotion response", content)
+            content = {
+                m.group(1): float(m.group(2))
+                for m in re.finditer(r'([^\s":.,]+?)"?\s*:\s*([\d.]+)', content)
+            }
+            # print(">> dict result", content)
+
+        return self.convert(content)


 if __name__ == "__main__":