Merge pull request #441 from coezbek/patch-3

Feat: Warn if input text contains UNK tokens
This commit is contained in:
nanaoto 2025-09-29 16:15:48 +08:00 committed by GitHub
commit 2ca41d738f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -447,6 +447,13 @@ class IndexTTS2:
text_tokens_list = self.tokenizer.tokenize(text)
segments = self.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment)
segments_count = len(segments)
text_token_ids = self.tokenizer.convert_tokens_to_ids(text_tokens_list)
if self.tokenizer.unk_token_id in text_token_ids:
print(f" >> Warning: input text contains {text_token_ids.count(self.tokenizer.unk_token_id)} unknown tokens (id={self.tokenizer.unk_token_id}):")
print( " Tokens which can't be encoded: ", [t for t, id in zip(text_tokens_list, text_token_ids) if id == self.tokenizer.unk_token_id])
print(f" Consider updating the BPE model or modifying the text to avoid unknown tokens.")
if verbose:
print("text_tokens_list:", text_tokens_list)
print("segments count:", segments_count)