diff --git a/indextts/infer_v2.py b/indextts/infer_v2.py
index d3aaa7d..cefc875 100644
--- a/indextts/infer_v2.py
+++ b/indextts/infer_v2.py
@@ -447,6 +447,23 @@ class IndexTTS2:
         text_tokens_list = self.tokenizer.tokenize(text)
         segments = self.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment)
         segments_count = len(segments)
+
+        # Warn up front when any token is mapped to the tokenizer's unknown-token id,
+        # and list exactly which tokens could not be encoded.
+        text_token_ids = self.tokenizer.convert_tokens_to_ids(text_tokens_list)
+        unk_token_id = self.tokenizer.unk_token_id
+        unk_count = text_token_ids.count(unk_token_id)
+        if unk_count:
+            # Loop variable is `token_id`, not `id`, so the builtin is not shadowed.
+            unk_tokens = [
+                token
+                for token, token_id in zip(text_tokens_list, text_token_ids)
+                if token_id == unk_token_id
+            ]
+            print(f" >> Warning: input text contains {unk_count} unknown tokens (id={unk_token_id}):")
+            print(" Tokens which can't be encoded: ", unk_tokens)
+            print(" Consider updating the BPE model or modifying the text to avoid unknown tokens.")
+
         if verbose:
             print("text_tokens_list:", text_tokens_list)
             print("segments count:", segments_count)