From a6a955d2aa18d6400b2f4cdaad35e792d5bb231b Mon Sep 17 00:00:00 2001 From: Arcitec <38923130+Arcitec@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:07:35 +0200 Subject: [PATCH] fix: Add support for melancholic emotion in text-to-emotion vectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - The "低落" (melancholic) emotion will always be mapped to "悲伤" (sad) by QwenEmotion's text analysis. It doesn't know the difference between those emotions even if the user writes the exact words. - Since the words and their meanings are so similar, it might not be possible to train QwenEmotion to learn the difference. - As a workaround, we perform input text analysis and look for words that mean "melancholic", and swap the "sad" detection result, to make the melancholic/low-energy speech emotion work correctly for users via text-to-emotion. --- indextts/infer_v2.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/indextts/infer_v2.py b/indextts/infer_v2.py index b851055..4d53e99 100644 --- a/indextts/infer_v2.py +++ b/indextts/infer_v2.py @@ -614,11 +614,22 @@ class QwenEmotion: # TODO: the "低落" (melancholic) emotion will always be mapped to # "悲伤" (sad) by QwenEmotion's text analysis. it doesn't know the # difference between those emotions even if user writes exact words. + # SEE: `self.melancholic_words` for current workaround. "低落": "melancholic", "惊讶": "surprised", "自然": "calm", } self.desired_vector_order = ["高兴", "愤怒", "悲伤", "恐惧", "反感", "低落", "惊讶", "自然"] + self.melancholic_words = { + # emotion text phrases that will force QwenEmotion's "悲伤" (sad) detection + # to become "低落" (melancholic) instead, to fix limitations mentioned above. + "低落", + "melancholy", + "melancholic", + "depression", + "depressed", + "gloomy", + } self.max_score = 1.2 self.min_score = 0.0 @@ -686,6 +697,15 @@ class QwenEmotion: } # print(">> dict result", content) + # workaround for QwenEmotion's inability to distinguish "悲伤" (sad) vs "低落" (melancholic). + # if we detect any of the IndexTTS "melancholic" words, we swap those vectors + # to encode the "sad" emotion as "melancholic" (instead of sadness). + text_input_lower = text_input.lower() + if any(word in text_input_lower for word in self.melancholic_words): + # print(">> before vec swap", content) + content["悲伤"], content["低落"] = content.get("低落", 0.0), content.get("悲伤", 0.0) + # print(">> after vec swap", content) + return self.convert(content)