Skip to content

Commit

Permalink
fix ctc tokenization
Browse files Browse the repository at this point in the history
Signed-off-by: andrusenkoau <andrusenkoau@gmail.com>
  • Loading branch information
andrusenkoau committed Aug 26, 2024
1 parent 28c8761 commit f75a8f7
Showing 1 changed file with 0 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -329,16 +329,10 @@ def _process_example(self, context: str, output: str, lang_id: str):
answer_ids = pre_pad + self.tokenizer.text_to_ids(answer_text, self.sample_alpha)

# Labels for the CTC head
#ctc_tokens_ids = answer_ids[1:]
# logging.warning("++++"*10)
# logging.warning(f"original_text: {original_text}")
normalized_text = self._normilize_text(output)
ctc_tokens_ids = []
if getattr(self.tokenizer, "asr_tokenizer", None):
ctc_tokens_ids = self.tokenizer.asr_tokenizer.text_to_ids(normalized_text, lang_id)
# logging.warning(f"lang_id: {lang_id}")
# logging.warning(f"ctc_tokens_ids: {ctc_tokens_ids}")
# raise ValueError("stop here")

if self.end_string:
answer_ids += self.tokenizer.text_to_ids(self.end_string)
Expand Down

0 comments on commit f75a8f7

Please sign in to comment.