From 547e82a9ee60a344dfb7f1bf19e795cafc443596 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome Date: Mon, 11 Dec 2023 13:58:56 +0100 Subject: [PATCH] Add Notus format in `Prompt.format_as` and update `examples/*.py` (#147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add `notus` within the pre-defined `Prompt` formats * Align `examples/*.py` and use `argilla/notus-7b-v1` from 🤗 Hub * Re-run `ruff format` and `ruff --fix` --- .../inference-endpoints-llm-custom-task.py | 2 +- examples/pipeline-accelerate-and-openai.py | 23 ++--- .../pipeline-fn-ultrafeedback-labeller.py | 15 +++- examples/pipeline-fn-ultrafeedback.py | 13 ++- examples/pipeline-llamacpp-and-openai.py | 14 ++-- examples/pipeline-selfinstruct-math-openai.py | 84 +++++++++---------- examples/pipeline-transformers-and-openai.py | 17 ++-- examples/pipeline-vllm-and-openai.py | 9 +- src/distilabel/tasks/__init__.py | 2 +- src/distilabel/tasks/prompt.py | 4 +- .../tasks/text_generation/self_instruct.py | 1 + 11 files changed, 101 insertions(+), 83 deletions(-) diff --git a/examples/inference-endpoints-llm-custom-task.py b/examples/inference-endpoints-llm-custom-task.py index 26db1e2a37..5099bdc81d 100644 --- a/examples/inference-endpoints-llm-custom-task.py +++ b/examples/inference-endpoints-llm-custom-task.py @@ -45,7 +45,7 @@ def output_args_names(self) -> list[str]: llm = InferenceEndpointsLLM( endpoint_name=os.getenv("HF_INFERENCE_ENDPOINT_NAME"), # type: ignore endpoint_namespace=os.getenv("HF_NAMESPACE"), # type: ignore - token=os.getenv("HF_TOKEN") or None, + token=os.getenv("HF_TOKEN", None), task=Llama2QuestionAnsweringTask(), ) print(llm.generate([{"question": "What's the capital of Spain?"}])) diff --git a/examples/pipeline-accelerate-and-openai.py b/examples/pipeline-accelerate-and-openai.py index b6b2d79ca4..43cfd4c3ea 100644 --- a/examples/pipeline-accelerate-and-openai.py +++ b/examples/pipeline-accelerate-and-openai.py @@ -11,15 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# WARNING: To run this example, you will need to install `accelerate` as +# `pip install accelerate` -# usage 'accelerate launch examples/pipeline-accelerate-and-openai.py' +# Usage: `accelerate launch examples/pipeline-accelerate-and-openai.py` import os import torch from accelerate import Accelerator from accelerate.utils import gather_object -from datasets import load_dataset, Dataset +from datasets import Dataset, load_dataset from distilabel.dataset import CustomDataset from distilabel.llm import OpenAILLM, TransformersLLM from distilabel.pipeline import Pipeline @@ -31,6 +33,7 @@ def get_current_device() -> int: """Get the current device. For GPU we return the local process index to enable multiple GPU training.""" return Accelerator().local_process_index if torch.cuda.is_available() else "cpu" + if __name__ == "__main__": accelerator = Accelerator() with accelerator.local_main_process_first(): @@ -56,7 +59,7 @@ def get_current_device() -> int: max_new_tokens=128, temperature=0.3, prompt_format="zephyr", - do_sample=True + do_sample=True, ), labeller=OpenAILLM( model="gpt-3.5-turbo", @@ -67,8 +70,8 @@ def get_current_device() -> int: temperature=0.0, ), ) - with accelerator.split_between_processes(dataset.to_dict()) as inputs: - inputs = Dataset.from_dict(inputs) + with accelerator.split_between_processes(dataset.to_dict()) as inputs: # type: ignore + inputs = Dataset.from_dict(inputs) # type: ignore dataset = pipeline.generate( inputs, # type: ignore num_generations=2, @@ -77,7 +80,7 @@ def get_current_device() -> int: display_progress_bar=True, ) dataset = gather_object(dataset) - + # Push to the HuggingFace Hub if accelerator.is_main_process: dataset = Dataset.from_list(dataset) @@ -87,7 +90,7 @@ def get_current_device() -> int: private=True, token=os.getenv("HF_TOKEN", None), ) - + try: from uuid import uuid4 @@ -100,13 +103,13 @@ def get_current_device() -> int: # Convert into an Argilla dataset and push it to Argilla dataset.__class__ = CustomDataset - dataset.task = UltraFeedbackTask.for_instruction_following() - rg_dataset = dataset.to_argilla() + dataset.task = UltraFeedbackTask.for_instruction_following() # type: ignore + rg_dataset = dataset.to_argilla() # type: ignore rg_dataset.push_to_argilla( name=f"my-dataset-{uuid4()}", workspace="admin", ) except ImportError: pass - accelerator.wait_for_everyone() + accelerator.wait_for_everyone() diff --git a/examples/pipeline-fn-ultrafeedback-labeller.py b/examples/pipeline-fn-ultrafeedback-labeller.py index 72687214e8..c45721e5d9 100644 --- a/examples/pipeline-fn-ultrafeedback-labeller.py +++ b/examples/pipeline-fn-ultrafeedback-labeller.py @@ -31,7 +31,7 @@ "honesty", max_new_tokens=256, num_threads=2, - openai_api_key=os.getenv("OPENAI_API_KEY"), + openai_api_key=os.getenv("OPENAI_API_KEY", None), temperature=0.0, ) @@ -46,10 +46,12 @@ end = time.time() print("Elapsed", end - start) + # Push to the HuggingFace Hub dataset.push_to_hub( os.getenv("HF_REPO_ID"), # type: ignore split="train", - private=False, + private=True, + token=os.getenv("HF_TOKEN", None), ) try: @@ -58,10 +60,15 @@ import argilla as rg rg.init( - api_url=os.getenv("ARGILLA_API_URL"), api_key=os.getenv("ARGILLA_API_KEY") + api_url=os.getenv("ARGILLA_API_URL"), + api_key=os.getenv("ARGILLA_API_KEY"), ) + # Convert into an Argilla dataset and push it to Argilla rg_dataset = dataset.to_argilla() - rg_dataset.push_to_argilla(name=f"my-dataset-{uuid4()}", workspace="admin") + rg_dataset.push_to_argilla( + name=f"my-dataset-{uuid4()}", + workspace="admin", + ) except ImportError: pass diff --git a/examples/pipeline-fn-ultrafeedback.py b/examples/pipeline-fn-ultrafeedback.py index 345d2410c6..24e614c383 100644 --- a/examples/pipeline-fn-ultrafeedback.py +++ b/examples/pipeline-fn-ultrafeedback.py @@ -55,10 +55,12 @@ end = time.time() print("Elapsed", end - start) + # Push to the HuggingFace Hub dataset.push_to_hub( os.getenv("HF_REPO_ID"), # type: ignore split="train", - private=False, + private=True, + token=os.getenv("HF_TOKEN", None), ) try: @@ -67,10 +69,15 @@ import argilla as rg rg.init( - api_url=os.getenv("ARGILLA_API_URL"), api_key=os.getenv("ARGILLA_API_KEY") + api_url=os.getenv("ARGILLA_API_URL"), + api_key=os.getenv("ARGILLA_API_KEY"), ) + # Convert into an Argilla dataset and push it to Argilla rg_dataset = dataset.to_argilla() - rg_dataset.push_to_argilla(name=f"my-dataset-{uuid4()}", workspace="admin") + rg_dataset.push_to_argilla( + name=f"my-dataset-{uuid4()}", + workspace="admin", + ) except ImportError: pass diff --git a/examples/pipeline-llamacpp-and-openai.py b/examples/pipeline-llamacpp-and-openai.py index d852fa8570..564de142fd 100644 --- a/examples/pipeline-llamacpp-and-openai.py +++ b/examples/pipeline-llamacpp-and-openai.py @@ -17,7 +17,7 @@ from datasets import load_dataset from distilabel.llm import LlamaCppLLM, OpenAILLM from distilabel.pipeline import Pipeline -from distilabel.tasks import Llama2TextGenerationTask, UltraFeedbackTask +from distilabel.tasks import TextGenerationTask, UltraFeedbackTask from llama_cpp import Llama if __name__ == "__main__": @@ -31,8 +31,9 @@ generator=LlamaCppLLM( model=Llama( model_path="", n_gpu_layers=-1 - ), # e.g. https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf - task=Llama2TextGenerationTask(), + ), # e.g. download it from https://huggingface.co/TheBloke/notus-7b-v1-GGUF/blob/main/notus-7b-v1.Q4_0.gguf + task=TextGenerationTask(), + prompt_format="notus", max_new_tokens=128, temperature=0.3, ), @@ -59,6 +60,7 @@ os.getenv("HF_REPO_ID"), # type: ignore split="train", private=True, + token=os.getenv("HF_TOKEN", None), ) try: @@ -67,15 +69,15 @@ import argilla as rg rg.init( - api_url="", - api_key="", + api_url=os.getenv("ARGILLA_API_URL"), + api_key=os.getenv("ARGILLA_API_KEY"), ) # Convert into an Argilla dataset and push it to Argilla rg_dataset = dataset.to_argilla() rg_dataset.push_to_argilla( name=f"my-dataset-{uuid4()}", - workspace="", + workspace="admin", ) except ImportError: pass diff --git a/examples/pipeline-selfinstruct-math-openai.py b/examples/pipeline-selfinstruct-math-openai.py index e08fa3471a..e92ced7f7a 100644 --- a/examples/pipeline-selfinstruct-math-openai.py +++ b/examples/pipeline-selfinstruct-math-openai.py @@ -14,11 +14,10 @@ import os -from distilabel.tasks import SelfInstructTask -from distilabel.pipeline import Pipeline -from distilabel.llm import OpenAILLM - from datasets import Dataset +from distilabel.llm import OpenAILLM +from distilabel.pipeline import Pipeline +from distilabel.tasks import SelfInstructTask math_topics = [ "Algebraic Expressions", @@ -70,52 +69,49 @@ "Linear Programming", "Analytical Geometry", "Euclidean Geometry", - "Non-Euclidean Geometry" + "Non-Euclidean Geometry", ] -dataset = Dataset.from_dict({ - "input": math_topics -}) -instruction_task = SelfInstructTask( - application_description="A question-answering assistant for engaging and challenging math quizzes and problems" -) +if __name__ == "__main__": + dataset = Dataset.from_dict({"input": math_topics}) -instruction_generator = OpenAILLM( - task=instruction_task, - openai_api_key=os.getenv("OPENAI_API_KEY"), - num_threads=4, - max_new_tokens=1024 -) + instruction_task = SelfInstructTask( + application_description="A question-answering assistant for engaging and challenging math quizzes and problems" + ) -pipeline = Pipeline( - generator=instruction_generator -) + instruction_generator = OpenAILLM( + task=instruction_task, + openai_api_key=os.getenv("OPENAI_API_KEY", None), + num_threads=4, + max_new_tokens=1024, + ) -distiset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2) + pipeline = Pipeline(generator=instruction_generator) -instructions = [] -for generations in distiset["generations"]: - for generation in generations: - instructions.extend(generation) -print(f"Number of generated instructions: {len(instructions)}") -print(instructions) + distiset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2) -# Output: -# Number of generated instructions: 2044 -# 1. Provide an explanation for solving a quadratic equation step by step. -# 2. What is the process for simplifying an algebraic expression with exponents? -# 3. Detail how to factorize a polynomial equation. -# 4. How can one determine the maximum or minimum value of a quadratic function? -# 5. Explain the concept of inequalities and how to solve them algebraically. -# 6. Describe the procedure for finding the roots of a cubic equation. -# 7. What are the different types of factoring techniques used in algebra? -# 8. Can you outline the steps for evaluating an algebraic expression using substitution? -# 9. Compare and contrast linear and quadratic equations in terms of their solutions and graphs. -# 10. How can one determine if a given graph represents a linear or quadratic equation? -# 1. How can I simplify the algebraic expression (x^2 + 3x + 2)(2x - 1)? -# 2. Provide step-by-step instructions on how to solve the equation 4(x + 2) - 3 = 7(2x - 1). -# 3. What is the value of x in the equation 3(x - 4) = 5x + 6? -# 4. Detail the process of factoring the expression 12x^2 - 7x - 10. -# 5. What is the result of expanding the binomial (2x - 3)^2? + instructions = [] + for generations in distiset["generations"]: + for generation in generations: + instructions.extend(generation) + print(f"Number of generated instructions: {len(instructions)}") + print(instructions) + # Output: + # Number of generated instructions: 2044 + # 1. Provide an explanation for solving a quadratic equation step by step. + # 2. What is the process for simplifying an algebraic expression with exponents? + # 3. Detail how to factorize a polynomial equation. + # 4. How can one determine the maximum or minimum value of a quadratic function? + # 5. Explain the concept of inequalities and how to solve them algebraically. + # 6. Describe the procedure for finding the roots of a cubic equation. + # 7. What are the different types of factoring techniques used in algebra? + # 8. Can you outline the steps for evaluating an algebraic expression using substitution? + # 9. Compare and contrast linear and quadratic equations in terms of their solutions and graphs. + # 10. How can one determine if a given graph represents a linear or quadratic equation? + # 1. How can I simplify the algebraic expression (x^2 + 3x + 2)(2x - 1)? + # 2. Provide step-by-step instructions on how to solve the equation 4(x + 2) - 3 = 7(2x - 1). + # 3. What is the value of x in the equation 3(x - 4) = 5x + 6? + # 4. Detail the process of factoring the expression 12x^2 - 7x - 10. + # 5. What is the result of expanding the binomial (2x - 3)^2? diff --git a/examples/pipeline-transformers-and-openai.py b/examples/pipeline-transformers-and-openai.py index 3f29a2bb55..a7f7761e8c 100644 --- a/examples/pipeline-transformers-and-openai.py +++ b/examples/pipeline-transformers-and-openai.py @@ -29,9 +29,9 @@ ) model = AutoModelForCausalLM.from_pretrained( - "HuggingFaceH4/zephyr-7b-beta", dtype=torch.bfloat16, device="cuda:0" + "argilla/notus-7b-v1", dtype=torch.bfloat16, device_map="auto" ) - tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + tokenizer = AutoTokenizer.from_pretrained("argilla/notus-7b-v1") tokenizer.padding_side = "left" pipeline = Pipeline( @@ -41,15 +41,15 @@ task=TextGenerationTask(), max_new_tokens=128, temperature=0.3, - prompt_format="zephyr", - do_sample=True + prompt_format="notus", + do_sample=True, ), labeller=OpenAILLM( model="gpt-3.5-turbo", task=UltraFeedbackTask.for_instruction_following(), max_new_tokens=128, num_threads=2, - openai_api_key="", + openai_api_key=os.getenv("OPENAI_API_KEY", None), temperature=0.0, ), ) @@ -67,6 +67,7 @@ os.getenv("HF_REPO_ID"), # type: ignore split="train", private=True, + token=os.getenv("HF_TOKEN", None), ) try: @@ -75,15 +76,15 @@ import argilla as rg rg.init( - api_url="", - api_key="", + api_url=os.getenv("ARGILLA_API_URL"), + api_key=os.getenv("ARGILLA_API_KEY"), ) # Convert into an Argilla dataset and push it to Argilla rg_dataset = dataset.to_argilla() rg_dataset.push_to_argilla( name=f"my-dataset-{uuid4()}", - workspace="", + workspace="admin", ) except ImportError: pass diff --git a/examples/pipeline-vllm-and-openai.py b/examples/pipeline-vllm-and-openai.py index ae4a4762c6..03b29bf92a 100644 --- a/examples/pipeline-vllm-and-openai.py +++ b/examples/pipeline-vllm-and-openai.py @@ -40,7 +40,7 @@ task=UltraFeedbackTask.for_text_quality(), max_new_tokens=128, num_threads=2, - openai_api_key="", + openai_api_key=os.getenv("OPENAI_API_KEY", None), temperature=0.0, ), ) @@ -58,6 +58,7 @@ os.getenv("HF_REPO_ID"), # type: ignore split="train", private=True, + token=os.getenv("HF_TOKEN", None), ) try: @@ -66,15 +67,15 @@ import argilla as rg rg.init( - api_url="", - api_key="", + api_url=os.getenv("ARGILLA_API_URL"), + api_key=os.getenv("ARGILLA_API_KEY"), ) # Convert into an Argilla dataset and push it to Argilla rg_dataset = dataset.to_argilla() rg_dataset.push_to_argilla( name=f"my-dataset-{uuid4()}", - workspace="", + workspace="admin", ) except ImportError: pass diff --git a/src/distilabel/tasks/__init__.py b/src/distilabel/tasks/__init__.py index ec8468e03e..69977c9dd6 100644 --- a/src/distilabel/tasks/__init__.py +++ b/src/distilabel/tasks/__init__.py @@ -31,5 +31,5 @@ "TextGenerationTask", "OpenAITextGenerationTask", "Llama2TextGenerationTask", - "SelfInstructTask" + "SelfInstructTask", ] diff --git a/src/distilabel/tasks/prompt.py b/src/distilabel/tasks/prompt.py index 0002468ce6..60bfd6eba9 100644 --- a/src/distilabel/tasks/prompt.py +++ b/src/distilabel/tasks/prompt.py @@ -26,7 +26,7 @@ class ChatCompletion(TypedDict): # A `Literal` type is used to ensure that the `format` argument is one of the supported formats. -SupportedFormats = Literal["default", "openai", "llama2", "chatml", "zephyr"] +SupportedFormats = Literal["default", "openai", "llama2", "chatml", "zephyr", "notus"] @dataclass @@ -84,7 +84,7 @@ def format_as(self, format: SupportedFormats) -> Union[str, List[ChatCompletion] return f"[INST] <>\n{self.system_prompt}<>\n\n{self.formatted_prompt} [/INST]" elif format == "chatml": return f"<|im_start|>system\n{self.system_prompt}<|im_end|>\n<|im_start|>user\n{self.formatted_prompt}<|im_end|>\n<|im_start|>assistant\n" - elif format == "zephyr": + elif format in ["zephyr", "notus"]: return f"<|system|>\n{self.system_prompt}\n<|user|>\n{self.formatted_prompt}\n<|assistant|>\n" else: raise ValueError( diff --git a/src/distilabel/tasks/text_generation/self_instruct.py b/src/distilabel/tasks/text_generation/self_instruct.py index 140828cb26..a03b2e0c4d 100644 --- a/src/distilabel/tasks/text_generation/self_instruct.py +++ b/src/distilabel/tasks/text_generation/self_instruct.py @@ -21,6 +21,7 @@ _SELF_INSTRUCT_TEMPLATE = get_template("self-instruct.jinja2") + @dataclass class SelfInstructTask(TextGenerationTask): """A `TextGenerationTask` following the Self-Instruct specification for building