Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
robertgshaw2-neuralmagic committed Oct 21, 2024
1 parent eb3f39e commit bb2fbe1
Showing 1 changed file with 3 additions and 8 deletions.
11 changes: 3 additions & 8 deletions examples/offline_inference_tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,11 @@
max_tokens=16)

# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm = LLM(
# model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
# model="neuralmagic/gemma-2-2b-it-quantized.w8a16",
model="neuralmagic/SmolLM-1.7B-Instruct-quantized.w8a16",
enforce_eager=True,
max_model_len=1024)
# In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="google/gemma-2b", enforce_eager=True)
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# assert generated_text.startswith(answer)
assert generated_text.startswith(answer)

0 comments on commit bb2fbe1

Please sign in to comment.