From 34be9bfb146ed4cf41808f002fc47332ddddaebd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christopher=20=C3=96zbek?= <c.oezbek@gmail.com>
Date: Fri, 26 Sep 2025 13:36:11 +0200
Subject: [PATCH] feat: Warn if input text contains UNK tokens

Added warnings for unknown tokens in input text.
---
 indextts/infer_v2.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/indextts/infer_v2.py b/indextts/infer_v2.py
index d3aaa7d..cefc875 100644
--- a/indextts/infer_v2.py
+++ b/indextts/infer_v2.py
@@ -447,6 +447,13 @@ class IndexTTS2:
         text_tokens_list = self.tokenizer.tokenize(text)
         segments = self.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment)
         segments_count = len(segments)
+        # Warn when the tokenizer maps any input token to its unknown-token id.
+        text_token_ids = self.tokenizer.convert_tokens_to_ids(text_tokens_list)
+        unk_tokens = [tok for tok, tok_id in zip(text_tokens_list, text_token_ids) if tok_id == self.tokenizer.unk_token_id]
+        if unk_tokens:
+            print(f"  >> Warning: input text contains {len(unk_tokens)} unknown tokens (id={self.tokenizer.unk_token_id}):")
+            print("     Tokens which can't be encoded: ", unk_tokens)
+            print("     Consider updating the BPE model or modifying the text to avoid unknown tokens.")
         if verbose:
             print("text_tokens_list:", text_tokens_list)
             print("segments count:", segments_count)