Skip to content

Commit

Permalink
Add Notus format in Prompt.format_as and update examples/*.py (#147)
Browse files (browse the repository at this point in the history)
* Add `notus` within the pre-defined `Prompt` formats

* Align `examples/*.py` and use `argilla/notus-7b-v1` from 🤗 Hub

* Re-run `ruff format` and `ruff --fix`
  • Loading branch information
alvarobartt authored Dec 11, 2023
1 parent 35a4b0d commit 547e82a
Show file tree
Hide file tree
Showing 11 changed files with 101 additions and 83 deletions.
2 changes: 1 addition & 1 deletion examples/inference-endpoints-llm-custom-task.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def output_args_names(self) -> list[str]:
llm = InferenceEndpointsLLM(
endpoint_name=os.getenv("HF_INFERENCE_ENDPOINT_NAME"), # type: ignore
endpoint_namespace=os.getenv("HF_NAMESPACE"), # type: ignore
token=os.getenv("HF_TOKEN") or None,
token=os.getenv("HF_TOKEN", None),
task=Llama2QuestionAnsweringTask(),
)
print(llm.generate([{"question": "What's the capital of Spain?"}]))
Expand Down
23 changes: 13 additions & 10 deletions examples/pipeline-accelerate-and-openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# WARNING: To run this example, you first need to install `accelerate`, e.g. via
# `pip install accelerate`

# usage 'accelerate launch examples/pipeline-accelerate-and-openai.py'
# Usage: `accelerate launch examples/pipeline-accelerate-and-openai.py`

import os

import torch
from accelerate import Accelerator
from accelerate.utils import gather_object
from datasets import load_dataset, Dataset
from datasets import Dataset, load_dataset
from distilabel.dataset import CustomDataset
from distilabel.llm import OpenAILLM, TransformersLLM
from distilabel.pipeline import Pipeline
Expand All @@ -31,6 +33,7 @@ def get_current_device() -> int:
"""Get the current device. For GPU we return the local process index to enable multiple GPU training."""
return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"


if __name__ == "__main__":
accelerator = Accelerator()
with accelerator.local_main_process_first():
Expand All @@ -56,7 +59,7 @@ def get_current_device() -> int:
max_new_tokens=128,
temperature=0.3,
prompt_format="zephyr",
do_sample=True
do_sample=True,
),
labeller=OpenAILLM(
model="gpt-3.5-turbo",
Expand All @@ -67,8 +70,8 @@ def get_current_device() -> int:
temperature=0.0,
),
)
with accelerator.split_between_processes(dataset.to_dict()) as inputs:
inputs = Dataset.from_dict(inputs)
with accelerator.split_between_processes(dataset.to_dict()) as inputs: # type: ignore
inputs = Dataset.from_dict(inputs) # type: ignore
dataset = pipeline.generate(
inputs, # type: ignore
num_generations=2,
Expand All @@ -77,7 +80,7 @@ def get_current_device() -> int:
display_progress_bar=True,
)
dataset = gather_object(dataset)

# Push to the HuggingFace Hub
if accelerator.is_main_process:
dataset = Dataset.from_list(dataset)
Expand All @@ -87,7 +90,7 @@ def get_current_device() -> int:
private=True,
token=os.getenv("HF_TOKEN", None),
)

try:
from uuid import uuid4

Expand All @@ -100,13 +103,13 @@ def get_current_device() -> int:

# Convert into an Argilla dataset and push it to Argilla
dataset.__class__ = CustomDataset
dataset.task = UltraFeedbackTask.for_instruction_following()
rg_dataset = dataset.to_argilla()
dataset.task = UltraFeedbackTask.for_instruction_following() # type: ignore
rg_dataset = dataset.to_argilla() # type: ignore
rg_dataset.push_to_argilla(
name=f"my-dataset-{uuid4()}",
workspace="admin",
)
except ImportError:
pass
accelerator.wait_for_everyone()

accelerator.wait_for_everyone()
15 changes: 11 additions & 4 deletions examples/pipeline-fn-ultrafeedback-labeller.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"honesty",
max_new_tokens=256,
num_threads=2,
openai_api_key=os.getenv("OPENAI_API_KEY"),
openai_api_key=os.getenv("OPENAI_API_KEY", None),
temperature=0.0,
)

Expand All @@ -46,10 +46,12 @@
end = time.time()
print("Elapsed", end - start)

# Push to the HuggingFace Hub
dataset.push_to_hub(
os.getenv("HF_REPO_ID"), # type: ignore
split="train",
private=False,
private=True,
token=os.getenv("HF_TOKEN", None),
)

try:
Expand All @@ -58,10 +60,15 @@
import argilla as rg

rg.init(
api_url=os.getenv("ARGILLA_API_URL"), api_key=os.getenv("ARGILLA_API_KEY")
api_url=os.getenv("ARGILLA_API_URL"),
api_key=os.getenv("ARGILLA_API_KEY"),
)

# Convert into an Argilla dataset and push it to Argilla
rg_dataset = dataset.to_argilla()
rg_dataset.push_to_argilla(name=f"my-dataset-{uuid4()}", workspace="admin")
rg_dataset.push_to_argilla(
name=f"my-dataset-{uuid4()}",
workspace="admin",
)
except ImportError:
pass
13 changes: 10 additions & 3 deletions examples/pipeline-fn-ultrafeedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,12 @@
end = time.time()
print("Elapsed", end - start)

# Push to the HuggingFace Hub
dataset.push_to_hub(
os.getenv("HF_REPO_ID"), # type: ignore
split="train",
private=False,
private=True,
token=os.getenv("HF_TOKEN", None),
)

try:
Expand All @@ -67,10 +69,15 @@
import argilla as rg

rg.init(
api_url=os.getenv("ARGILLA_API_URL"), api_key=os.getenv("ARGILLA_API_KEY")
api_url=os.getenv("ARGILLA_API_URL"),
api_key=os.getenv("ARGILLA_API_KEY"),
)

# Convert into an Argilla dataset and push it to Argilla
rg_dataset = dataset.to_argilla()
rg_dataset.push_to_argilla(name=f"my-dataset-{uuid4()}", workspace="admin")
rg_dataset.push_to_argilla(
name=f"my-dataset-{uuid4()}",
workspace="admin",
)
except ImportError:
pass
14 changes: 8 additions & 6 deletions examples/pipeline-llamacpp-and-openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from datasets import load_dataset
from distilabel.llm import LlamaCppLLM, OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.tasks import Llama2TextGenerationTask, UltraFeedbackTask
from distilabel.tasks import TextGenerationTask, UltraFeedbackTask
from llama_cpp import Llama

if __name__ == "__main__":
Expand All @@ -31,8 +31,9 @@
generator=LlamaCppLLM(
model=Llama(
model_path="<PATH_TO_GGUF_MODEL>", n_gpu_layers=-1
), # e.g. https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf
task=Llama2TextGenerationTask(),
), # e.g. download it from https://huggingface.co/TheBloke/notus-7b-v1-GGUF/blob/main/notus-7b-v1.Q4_0.gguf
task=TextGenerationTask(),
prompt_format="notus",
max_new_tokens=128,
temperature=0.3,
),
Expand All @@ -59,6 +60,7 @@
os.getenv("HF_REPO_ID"), # type: ignore
split="train",
private=True,
token=os.getenv("HF_TOKEN", None),
)

try:
Expand All @@ -67,15 +69,15 @@
import argilla as rg

rg.init(
api_url="<ARGILLA_API_URL>",
api_key="<ARGILLA_API_KEY>",
api_url=os.getenv("ARGILLA_API_URL"),
api_key=os.getenv("ARGILLA_API_KEY"),
)

# Convert into an Argilla dataset and push it to Argilla
rg_dataset = dataset.to_argilla()
rg_dataset.push_to_argilla(
name=f"my-dataset-{uuid4()}",
workspace="<ARGILLA_WORKSPACE_NAME>",
workspace="admin",
)
except ImportError:
pass
84 changes: 40 additions & 44 deletions examples/pipeline-selfinstruct-math-openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@

import os

from distilabel.tasks import SelfInstructTask
from distilabel.pipeline import Pipeline
from distilabel.llm import OpenAILLM

from datasets import Dataset
from distilabel.llm import OpenAILLM
from distilabel.pipeline import Pipeline
from distilabel.tasks import SelfInstructTask

math_topics = [
"Algebraic Expressions",
Expand Down Expand Up @@ -70,52 +69,49 @@
"Linear Programming",
"Analytical Geometry",
"Euclidean Geometry",
"Non-Euclidean Geometry"
"Non-Euclidean Geometry",
]

dataset = Dataset.from_dict({
"input": math_topics
})

instruction_task = SelfInstructTask(
application_description="A question-answering assistant for engaging and challenging math quizzes and problems"
)
if __name__ == "__main__":
dataset = Dataset.from_dict({"input": math_topics})

instruction_generator = OpenAILLM(
task=instruction_task,
openai_api_key=os.getenv("OPENAI_API_KEY"),
num_threads=4,
max_new_tokens=1024
)
instruction_task = SelfInstructTask(
application_description="A question-answering assistant for engaging and challenging math quizzes and problems"
)

pipeline = Pipeline(
generator=instruction_generator
)
instruction_generator = OpenAILLM(
task=instruction_task,
openai_api_key=os.getenv("OPENAI_API_KEY", None),
num_threads=4,
max_new_tokens=1024,
)

distiset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2)
pipeline = Pipeline(generator=instruction_generator)

instructions = []
for generations in distiset["generations"]:
for generation in generations:
instructions.extend(generation)
print(f"Number of generated instructions: {len(instructions)}")
print(instructions)
distiset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2)

# Output:
# Number of generated instructions: 2044
# 1. Provide an explanation for solving a quadratic equation step by step.
# 2. What is the process for simplifying an algebraic expression with exponents?
# 3. Detail how to factorize a polynomial equation.
# 4. How can one determine the maximum or minimum value of a quadratic function?
# 5. Explain the concept of inequalities and how to solve them algebraically.
# 6. Describe the procedure for finding the roots of a cubic equation.
# 7. What are the different types of factoring techniques used in algebra?
# 8. Can you outline the steps for evaluating an algebraic expression using substitution?
# 9. Compare and contrast linear and quadratic equations in terms of their solutions and graphs.
# 10. How can one determine if a given graph represents a linear or quadratic equation?
# 1. How can I simplify the algebraic expression (x^2 + 3x + 2)(2x - 1)?
# 2. Provide step-by-step instructions on how to solve the equation 4(x + 2) - 3 = 7(2x - 1).
# 3. What is the value of x in the equation 3(x - 4) = 5x + 6?
# 4. Detail the process of factoring the expression 12x^2 - 7x - 10.
# 5. What is the result of expanding the binomial (2x - 3)^2?
instructions = []
for generations in distiset["generations"]:
for generation in generations:
instructions.extend(generation)
print(f"Number of generated instructions: {len(instructions)}")
print(instructions)

# Output:
# Number of generated instructions: 2044
# 1. Provide an explanation for solving a quadratic equation step by step.
# 2. What is the process for simplifying an algebraic expression with exponents?
# 3. Detail how to factorize a polynomial equation.
# 4. How can one determine the maximum or minimum value of a quadratic function?
# 5. Explain the concept of inequalities and how to solve them algebraically.
# 6. Describe the procedure for finding the roots of a cubic equation.
# 7. What are the different types of factoring techniques used in algebra?
# 8. Can you outline the steps for evaluating an algebraic expression using substitution?
# 9. Compare and contrast linear and quadratic equations in terms of their solutions and graphs.
# 10. How can one determine if a given graph represents a linear or quadratic equation?
# 1. How can I simplify the algebraic expression (x^2 + 3x + 2)(2x - 1)?
# 2. Provide step-by-step instructions on how to solve the equation 4(x + 2) - 3 = 7(2x - 1).
# 3. What is the value of x in the equation 3(x - 4) = 5x + 6?
# 4. Detail the process of factoring the expression 12x^2 - 7x - 10.
# 5. What is the result of expanding the binomial (2x - 3)^2?
17 changes: 9 additions & 8 deletions examples/pipeline-transformers-and-openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
)

model = AutoModelForCausalLM.from_pretrained(
"HuggingFaceH4/zephyr-7b-beta", dtype=torch.bfloat16, device="cuda:0"
"argilla/notus-7b-v1", dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
tokenizer = AutoTokenizer.from_pretrained("argilla/notus-7b-v1")
tokenizer.padding_side = "left"

pipeline = Pipeline(
Expand All @@ -41,15 +41,15 @@
task=TextGenerationTask(),
max_new_tokens=128,
temperature=0.3,
prompt_format="zephyr",
do_sample=True
prompt_format="notus",
do_sample=True,
),
labeller=OpenAILLM(
model="gpt-3.5-turbo",
task=UltraFeedbackTask.for_instruction_following(),
max_new_tokens=128,
num_threads=2,
openai_api_key="<OPENAI_API_KEY>",
openai_api_key=os.getenv("OPENAI_API_KEY", None),
temperature=0.0,
),
)
Expand All @@ -67,6 +67,7 @@
os.getenv("HF_REPO_ID"), # type: ignore
split="train",
private=True,
token=os.getenv("HF_TOKEN", None),
)

try:
Expand All @@ -75,15 +76,15 @@
import argilla as rg

rg.init(
api_url="<ARGILLA_API_URL>",
api_key="<ARGILLA_API_KEY>",
api_url=os.getenv("ARGILLA_API_URL"),
api_key=os.getenv("ARGILLA_API_KEY"),
)

# Convert into an Argilla dataset and push it to Argilla
rg_dataset = dataset.to_argilla()
rg_dataset.push_to_argilla(
name=f"my-dataset-{uuid4()}",
workspace="<ARGILLA_WORKSPACE_NAME>",
workspace="admin",
)
except ImportError:
pass
Loading

0 comments on commit 547e82a

Please sign in to comment.