From 9beb7ac47f9791b43bb6e5dedc537dc906b2a14a Mon Sep 17 00:00:00 2001
From: Zifei Tong
Date: Wed, 26 Jun 2024 07:07:32 +0900
Subject: [PATCH 1/5] Fix off-by-one bug in decode_prompt_logprobs_inplace()

---
 vllm/transformers_utils/detokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index e8e53f4946ef..3eea410ed63a 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -37,7 +37,8 @@ def decode_prompt_logprobs_inplace(
         # We can pick any sequence for the prompt.
         seq = next(iter(seq_group.seqs_dict.values()))
         # Only prompt, without the generated token.
-        all_token_ids = seq.get_token_ids()
+        # Skip the first token as its logprob is not defined.
+        all_token_ids = seq.get_token_ids()[1:]
         prompt_token_ids = all_token_ids[:-1]
         tokenizer = self.get_tokenizer_for_seq(seq)
         prefix_offset = 0
@@ -46,7 +47,6 @@
         next_iter_read_offset = 0
         next_iter_tokens: List[str] = []
         prev_tokens = None
-
         for token_position, prompt_logprobs_for_token in enumerate(
                 prompt_logprobs):
             if not prompt_logprobs_for_token:

From fd050e2f36f067e013f339c2528175e268fc3f29 Mon Sep 17 00:00:00 2001
From: Zifei Tong
Date: Wed, 26 Jun 2024 07:39:28 +0900
Subject: [PATCH 2/5] Fix unittests

---
 tests/tokenization/test_detokenize.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 12e5ae85adea..36c8df396b8b 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -138,6 +138,10 @@ def create_dummy_logprobs(
         token_id + 1: Logprob(logprob=0.1)
     } for token_id in complete_sequence_token_ids]
 
+def create_dummy_prompt_logprobs(
+        complete_sequence_token_ids: List[int]) -> List[Dict[int, Logprob]]:
+    # logprob for the first prompt token is not defined.
+    return create_dummy_logprobs(complete_sequence_token_ids)[1:]
 
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
@@ -192,19 +196,23 @@ def test_decode_prompt_logprobs(complete_sequence: str,
                               seqs=[seq],
                               sampling_params=sampling_params,
                               arrival_time=0.0)
-    dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
+    dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids)
     detokenizer.decode_prompt_logprobs_inplace(seq_group, dummy_logprobs)
     decoded_prompt_logprobs = dummy_logprobs
 
     if skip_special_tokens:
+        # decoded_prompt_logprobs doesn't contain the first token.
+        token_ids = complete_sequence_token_ids[1:]
+        tokenzier = detokenizer.get_tokenizer_for_seq(seq)
+        text = tokenzier.decode(token_ids, skip_special_tokens=skip_special_tokens)
         # Text for logprobs for the chosen token should be the same as the
         # prompt text. Note that this will only be true if we skip
         # special tokens.
-        assert complete_sequence == "".join([
+        assert text == "".join([
             logprobs[token_id].decoded_token for token_id, logprobs in zip(
-                complete_sequence_token_ids, decoded_prompt_logprobs)
+                token_ids, decoded_prompt_logprobs)
         ])
-        assert complete_sequence != "".join([
+        assert text != "".join([
             logprobs[token_id + 1].decoded_token for token_id, logprobs in zip(
-                complete_sequence_token_ids, decoded_prompt_logprobs)
+                token_ids, decoded_prompt_logprobs)
         ])

From cc4bc6935930f15e92270d4cfd91b01a0f823f3c Mon Sep 17 00:00:00 2001
From: Zifei Tong
Date: Wed, 26 Jun 2024 07:41:27 +0900
Subject: [PATCH 3/5] reformat

---
 tests/tokenization/test_detokenize.py  | 13 ++++++++-----
 vllm/transformers_utils/detokenizer.py |  1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 36c8df396b8b..3395589fe507 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -138,11 +138,13 @@ def create_dummy_logprobs(
         token_id + 1: Logprob(logprob=0.1)
     } for token_id in complete_sequence_token_ids]
 
+
 def create_dummy_prompt_logprobs(
         complete_sequence_token_ids: List[int]) -> List[Dict[int, Logprob]]:
     # logprob for the first prompt token is not defined.
     return create_dummy_logprobs(complete_sequence_token_ids)[1:]
 
+
 @pytest.mark.parametrize("complete_sequence", TRUTH)
 @pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
 @pytest.mark.parametrize("skip_special_tokens", [True, False])
@@ -204,15 +206,16 @@ def test_decode_prompt_logprobs(complete_sequence: str,
         # decoded_prompt_logprobs doesn't contain the first token.
         token_ids = complete_sequence_token_ids[1:]
         tokenzier = detokenizer.get_tokenizer_for_seq(seq)
-        text = tokenzier.decode(token_ids, skip_special_tokens=skip_special_tokens)
+        text = tokenzier.decode(token_ids,
+                                skip_special_tokens=skip_special_tokens)
         # Text for logprobs for the chosen token should be the same as the
         # prompt text. Note that this will only be true if we skip
         # special tokens.
         assert text == "".join([
-            logprobs[token_id].decoded_token for token_id, logprobs in zip(
-                token_ids, decoded_prompt_logprobs)
+            logprobs[token_id].decoded_token
+            for token_id, logprobs in zip(token_ids, decoded_prompt_logprobs)
         ])
         assert text != "".join([
-            logprobs[token_id + 1].decoded_token for token_id, logprobs in zip(
-                token_ids, decoded_prompt_logprobs)
+            logprobs[token_id + 1].decoded_token
+            for token_id, logprobs in zip(token_ids, decoded_prompt_logprobs)
         ])

diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 3eea410ed63a..0ae8fc5fe7be 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -47,6 +47,7 @@
         next_iter_read_offset = 0
         next_iter_tokens: List[str] = []
         prev_tokens = None
+
         for token_position, prompt_logprobs_for_token in enumerate(
                 prompt_logprobs):
             if not prompt_logprobs_for_token:

From 7d20f999ae508e8b74ed47bd4bd1c2fc0abae4fe Mon Sep 17 00:00:00 2001
From: Zifei Tong
Date: Wed, 26 Jun 2024 08:20:54 +0900
Subject: [PATCH 4/5] Add regression test

---
 tests/tokenization/test_detokenize.py | 39 +++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 3395589fe507..b66c5b506b76 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -219,3 +219,42 @@
             logprobs[token_id + 1].decoded_token
             for token_id, logprobs in zip(token_ids, decoded_prompt_logprobs)
         ])
+
+
+@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m"])
+def test_decode_prompt_logprobs_5846(detokenizer: Detokenizer):
+    """ Regression test for #5846. """
+
+    # This set of random input will generate incorrect output before #5846.
+    prompt_token_ids = [3290, 1562, 8652, 3123, 1838, 9660]
+    dummy_logprobs = [{
+        1562: Logprob(logprob=0.0),
+        3290: Logprob(logprob=0.1)
+    }, {
+        8652: Logprob(logprob=0.0),
+        977: Logprob(logprob=0.1)
+    }, {
+        3123: Logprob(logprob=0.0),
+        30: Logprob(logprob=0.1)
+    }, {
+        1838: Logprob(logprob=0.0),
+        6: Logprob(logprob=0.1)
+    }, {
+        9660: Logprob(logprob=0.0),
+        1316: Logprob(logprob=0.1)
+    }]
+
+    seq = create_sequence(prompt_token_ids)
+    seq_group = SequenceGroup(
+        request_id="1",
+        seqs=[seq],
+        sampling_params=SamplingParams(prompt_logprobs=1),
+        arrival_time=0.0)
+
+    detokenizer.decode_prompt_logprobs_inplace(seq_group, dummy_logprobs)
+    decoded_prompt_logprobs = dummy_logprobs
+
+    tokenzier = detokenizer.get_tokenizer_for_seq(seq)
+    for logprobs in decoded_prompt_logprobs:
+        for token_id, logprob in logprobs.items():
+            assert tokenzier.decode(token_id) == logprob.decoded_token

From e0df956907f81929260df5f1a9a2cc570d90b57c Mon Sep 17 00:00:00 2001
From: Zifei Tong
Date: Wed, 26 Jun 2024 14:05:34 +0900
Subject: [PATCH 5/5] rename

---
 tests/tokenization/test_detokenize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index b66c5b506b76..5fc5df4cb115 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -222,8 +222,8 @@
 
 
 @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m"])
-def test_decode_prompt_logprobs_5846(detokenizer: Detokenizer):
-    """ Regression test for #5846. """
""" +def test_decode_prompt_logprobs_pr_5846(detokenizer: Detokenizer): + """ Regression test for PR #5846. """ # This set of random input will generate incorrect output before #5846. prompt_token_ids = [3290, 1562, 8652, 3123, 1838, 9660]