Add Notus format in Prompt.format_as and update examples/*.py (#147)

* Add `notus` within the pre-defined `Prompt` formats
* Align `examples/*.py` and use `argilla/notus-7b-v1` from 🤗 Hub
* Re-run `ruff format` and `ruff --fix`
argilla-io · Dec 11, 2023 · 547e82a
1 parent 35a4b0d
commit 547e82a
Show file tree

Hide file tree

Showing 11 changed files with 101 additions and 83 deletions.
diff --git a/examples/inference-endpoints-llm-custom-task.py b/examples/inference-endpoints-llm-custom-task.py
@@ -45,7 +45,7 @@ def output_args_names(self) -> list[str]:
     llm = InferenceEndpointsLLM(
         endpoint_name=os.getenv("HF_INFERENCE_ENDPOINT_NAME"),  # type: ignore
         endpoint_namespace=os.getenv("HF_NAMESPACE"),  # type: ignore
-        token=os.getenv("HF_TOKEN") or None,
+        token=os.getenv("HF_TOKEN", None),
         task=Llama2QuestionAnsweringTask(),
     )
     print(llm.generate([{"question": "What's the capital of Spain?"}]))

diff --git a/examples/pipeline-accelerate-and-openai.py b/examples/pipeline-accelerate-and-openai.py
@@ -11,15 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# WARNING: To run this example, you will need to install `accelerate` as
+# `pip install accelerate`
 
-# usage 'accelerate launch examples/pipeline-accelerate-and-openai.py'
+# Usage: `accelerate launch examples/pipeline-accelerate-and-openai.py`
 
 import os
 
 import torch
 from accelerate import Accelerator
 from accelerate.utils import gather_object
-from datasets import load_dataset, Dataset
+from datasets import Dataset, load_dataset
 from distilabel.dataset import CustomDataset
 from distilabel.llm import OpenAILLM, TransformersLLM
 from distilabel.pipeline import Pipeline
@@ -31,6 +33,7 @@ def get_current_device() -> int:
     """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
     return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
 
+
 if __name__ == "__main__":
     accelerator = Accelerator()
     with accelerator.local_main_process_first():
@@ -56,7 +59,7 @@ def get_current_device() -> int:
             max_new_tokens=128,
             temperature=0.3,
             prompt_format="zephyr",
-            do_sample=True
+            do_sample=True,
         ),
         labeller=OpenAILLM(
             model="gpt-3.5-turbo",
@@ -67,8 +70,8 @@ def get_current_device() -> int:
             temperature=0.0,
         ),
     )
-    with accelerator.split_between_processes(dataset.to_dict()) as inputs:
-        inputs = Dataset.from_dict(inputs)
+    with accelerator.split_between_processes(dataset.to_dict()) as inputs:  # type: ignore
+        inputs = Dataset.from_dict(inputs)  # type: ignore
         dataset = pipeline.generate(
             inputs,  # type: ignore
             num_generations=2,
@@ -77,7 +80,7 @@ def get_current_device() -> int:
             display_progress_bar=True,
         )
         dataset = gather_object(dataset)
-        
+
     # Push to the HuggingFace Hub
     if accelerator.is_main_process:
         dataset = Dataset.from_list(dataset)
@@ -87,7 +90,7 @@ def get_current_device() -> int:
             private=True,
             token=os.getenv("HF_TOKEN", None),
         )
-        
+
         try:
             from uuid import uuid4
 
@@ -100,13 +103,13 @@ def get_current_device() -> int:
 
             # Convert into an Argilla dataset and push it to Argilla
             dataset.__class__ = CustomDataset
-            dataset.task = UltraFeedbackTask.for_instruction_following()
-            rg_dataset = dataset.to_argilla()
+            dataset.task = UltraFeedbackTask.for_instruction_following()  # type: ignore
+            rg_dataset = dataset.to_argilla()  # type: ignore
             rg_dataset.push_to_argilla(
                 name=f"my-dataset-{uuid4()}",
                 workspace="admin",
             )
         except ImportError:
             pass
-    accelerator.wait_for_everyone()
 
+    accelerator.wait_for_everyone()
diff --git a/examples/pipeline-fn-ultrafeedback-labeller.py b/examples/pipeline-fn-ultrafeedback-labeller.py
@@ -31,7 +31,7 @@
         "honesty",
         max_new_tokens=256,
         num_threads=2,
-        openai_api_key=os.getenv("OPENAI_API_KEY"),
+        openai_api_key=os.getenv("OPENAI_API_KEY", None),
         temperature=0.0,
     )
 
@@ -46,10 +46,12 @@
     end = time.time()
     print("Elapsed", end - start)
 
+    # Push to the HuggingFace Hub
     dataset.push_to_hub(
         os.getenv("HF_REPO_ID"),  # type: ignore
         split="train",
-        private=False,
+        private=True,
+        token=os.getenv("HF_TOKEN", None),
     )
 
     try:
@@ -58,10 +60,15 @@
         import argilla as rg
 
         rg.init(
-            api_url=os.getenv("ARGILLA_API_URL"), api_key=os.getenv("ARGILLA_API_KEY")
+            api_url=os.getenv("ARGILLA_API_URL"),
+            api_key=os.getenv("ARGILLA_API_KEY"),
         )
 
+        # Convert into an Argilla dataset and push it to Argilla
         rg_dataset = dataset.to_argilla()
-        rg_dataset.push_to_argilla(name=f"my-dataset-{uuid4()}", workspace="admin")
+        rg_dataset.push_to_argilla(
+            name=f"my-dataset-{uuid4()}",
+            workspace="admin",
+        )
     except ImportError:
         pass
diff --git a/examples/pipeline-fn-ultrafeedback.py b/examples/pipeline-fn-ultrafeedback.py
@@ -55,10 +55,12 @@
     end = time.time()
     print("Elapsed", end - start)
 
+    # Push to the HuggingFace Hub
     dataset.push_to_hub(
         os.getenv("HF_REPO_ID"),  # type: ignore
         split="train",
-        private=False,
+        private=True,
+        token=os.getenv("HF_TOKEN", None),
     )
 
     try:
@@ -67,10 +69,15 @@
         import argilla as rg
 
         rg.init(
-            api_url=os.getenv("ARGILLA_API_URL"), api_key=os.getenv("ARGILLA_API_KEY")
+            api_url=os.getenv("ARGILLA_API_URL"),
+            api_key=os.getenv("ARGILLA_API_KEY"),
         )
 
+        # Convert into an Argilla dataset and push it to Argilla
         rg_dataset = dataset.to_argilla()
-        rg_dataset.push_to_argilla(name=f"my-dataset-{uuid4()}", workspace="admin")
+        rg_dataset.push_to_argilla(
+            name=f"my-dataset-{uuid4()}",
+            workspace="admin",
+        )
     except ImportError:
         pass
diff --git a/examples/pipeline-llamacpp-and-openai.py b/examples/pipeline-llamacpp-and-openai.py
@@ -17,7 +17,7 @@
 from datasets import load_dataset
 from distilabel.llm import LlamaCppLLM, OpenAILLM
 from distilabel.pipeline import Pipeline
-from distilabel.tasks import Llama2TextGenerationTask, UltraFeedbackTask
+from distilabel.tasks import TextGenerationTask, UltraFeedbackTask
 from llama_cpp import Llama
 
 if __name__ == "__main__":
@@ -31,8 +31,9 @@
         generator=LlamaCppLLM(
             model=Llama(
                 model_path="<PATH_TO_GGUF_MODEL>", n_gpu_layers=-1
-            ),  # e.g. https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf
-            task=Llama2TextGenerationTask(),
+            ),  # e.g. download it from https://huggingface.co/TheBloke/notus-7b-v1-GGUF/blob/main/notus-7b-v1.Q4_0.gguf
+            task=TextGenerationTask(),
+            prompt_format="notus",
             max_new_tokens=128,
             temperature=0.3,
         ),
@@ -59,6 +60,7 @@
         os.getenv("HF_REPO_ID"),  # type: ignore
         split="train",
         private=True,
+        token=os.getenv("HF_TOKEN", None),
     )
 
     try:
@@ -67,15 +69,15 @@
         import argilla as rg
 
         rg.init(
-            api_url="<ARGILLA_API_URL>",
-            api_key="<ARGILLA_API_KEY>",
+            api_url=os.getenv("ARGILLA_API_URL"),
+            api_key=os.getenv("ARGILLA_API_KEY"),
         )
 
         # Convert into an Argilla dataset and push it to Argilla
         rg_dataset = dataset.to_argilla()
         rg_dataset.push_to_argilla(
             name=f"my-dataset-{uuid4()}",
-            workspace="<ARGILLA_WORKSPACE_NAME>",
+            workspace="admin",
         )
     except ImportError:
         pass
diff --git a/examples/pipeline-selfinstruct-math-openai.py b/examples/pipeline-selfinstruct-math-openai.py
@@ -14,11 +14,10 @@
 
 import os
 
-from distilabel.tasks import SelfInstructTask
-from distilabel.pipeline import Pipeline
-from distilabel.llm import OpenAILLM
-
 from datasets import Dataset
+from distilabel.llm import OpenAILLM
+from distilabel.pipeline import Pipeline
+from distilabel.tasks import SelfInstructTask
 
 math_topics = [
     "Algebraic Expressions",
@@ -70,52 +69,49 @@
     "Linear Programming",
     "Analytical Geometry",
     "Euclidean Geometry",
-    "Non-Euclidean Geometry"
+    "Non-Euclidean Geometry",
 ]
 
-dataset = Dataset.from_dict({
-    "input": math_topics
-})
 
-instruction_task = SelfInstructTask(
-    application_description="A question-answering assistant for engaging and challenging math quizzes and problems"
-)
+if __name__ == "__main__":
+    dataset = Dataset.from_dict({"input": math_topics})
 
-instruction_generator = OpenAILLM(
-    task=instruction_task,
-    openai_api_key=os.getenv("OPENAI_API_KEY"),
-    num_threads=4,
-    max_new_tokens=1024
-)
+    instruction_task = SelfInstructTask(
+        application_description="A question-answering assistant for engaging and challenging math quizzes and problems"
+    )
 
-pipeline = Pipeline(
-    generator=instruction_generator
-)
+    instruction_generator = OpenAILLM(
+        task=instruction_task,
+        openai_api_key=os.getenv("OPENAI_API_KEY", None),
+        num_threads=4,
+        max_new_tokens=1024,
+    )
 
-distiset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2)
+    pipeline = Pipeline(generator=instruction_generator)
 
-instructions = []
-for generations in distiset["generations"]:
-    for generation in generations:
-        instructions.extend(generation)
-print(f"Number of generated instructions: {len(instructions)}")
-print(instructions)
+    distiset = pipeline.generate(dataset=dataset, num_generations=4, batch_size=2)
 
-# Output:
-# Number of generated instructions: 2044
-# 1. Provide an explanation for solving a quadratic equation step by step.
-# 2. What is the process for simplifying an algebraic expression with exponents?
-# 3. Detail how to factorize a polynomial equation.
-# 4. How can one determine the maximum or minimum value of a quadratic function?
-# 5. Explain the concept of inequalities and how to solve them algebraically.
-# 6. Describe the procedure for finding the roots of a cubic equation.
-# 7. What are the different types of factoring techniques used in algebra?
-# 8. Can you outline the steps for evaluating an algebraic expression using substitution?
-# 9. Compare and contrast linear and quadratic equations in terms of their solutions and graphs.
-# 10. How can one determine if a given graph represents a linear or quadratic equation?
-# 1. How can I simplify the algebraic expression (x^2 + 3x + 2)(2x - 1)?
-# 2. Provide step-by-step instructions on how to solve the equation 4(x + 2) - 3 = 7(2x - 1).
-# 3. What is the value of x in the equation 3(x - 4) = 5x + 6?
-# 4. Detail the process of factoring the expression 12x^2 - 7x - 10.
-# 5. What is the result of expanding the binomial (2x - 3)^2?
+    instructions = []
+    for generations in distiset["generations"]:
+        for generation in generations:
+            instructions.extend(generation)
+    print(f"Number of generated instructions: {len(instructions)}")
+    print(instructions)
 
+    # Output:
+    # Number of generated instructions: 2044
+    # 1. Provide an explanation for solving a quadratic equation step by step.
+    # 2. What is the process for simplifying an algebraic expression with exponents?
+    # 3. Detail how to factorize a polynomial equation.
+    # 4. How can one determine the maximum or minimum value of a quadratic function?
+    # 5. Explain the concept of inequalities and how to solve them algebraically.
+    # 6. Describe the procedure for finding the roots of a cubic equation.
+    # 7. What are the different types of factoring techniques used in algebra?
+    # 8. Can you outline the steps for evaluating an algebraic expression using substitution?
+    # 9. Compare and contrast linear and quadratic equations in terms of their solutions and graphs.
+    # 10. How can one determine if a given graph represents a linear or quadratic equation?
+    # 1. How can I simplify the algebraic expression (x^2 + 3x + 2)(2x - 1)?
+    # 2. Provide step-by-step instructions on how to solve the equation 4(x + 2) - 3 = 7(2x - 1).
+    # 3. What is the value of x in the equation 3(x - 4) = 5x + 6?
+    # 4. Detail the process of factoring the expression 12x^2 - 7x - 10.
+    # 5. What is the result of expanding the binomial (2x - 3)^2?
diff --git a/examples/pipeline-transformers-and-openai.py b/examples/pipeline-transformers-and-openai.py
@@ -29,9 +29,9 @@
     )
 
     model = AutoModelForCausalLM.from_pretrained(
-        "HuggingFaceH4/zephyr-7b-beta", dtype=torch.bfloat16, device="cuda:0"
+        "argilla/notus-7b-v1", dtype=torch.bfloat16, device_map="auto"
     )
-    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+    tokenizer = AutoTokenizer.from_pretrained("argilla/notus-7b-v1")
     tokenizer.padding_side = "left"
 
     pipeline = Pipeline(
@@ -41,15 +41,15 @@
             task=TextGenerationTask(),
             max_new_tokens=128,
             temperature=0.3,
-            prompt_format="zephyr",
-            do_sample=True
+            prompt_format="notus",
+            do_sample=True,
         ),
         labeller=OpenAILLM(
             model="gpt-3.5-turbo",
             task=UltraFeedbackTask.for_instruction_following(),
             max_new_tokens=128,
             num_threads=2,
-            openai_api_key="<OPENAI_API_KEY>",
+            openai_api_key=os.getenv("OPENAI_API_KEY", None),
             temperature=0.0,
         ),
     )
@@ -67,6 +67,7 @@
         os.getenv("HF_REPO_ID"),  # type: ignore
         split="train",
         private=True,
+        token=os.getenv("HF_TOKEN", None),
     )
 
     try:
@@ -75,15 +76,15 @@
         import argilla as rg
 
         rg.init(
-            api_url="<ARGILLA_API_URL>",
-            api_key="<ARGILLA_API_KEY>",
+            api_url=os.getenv("ARGILLA_API_URL"),
+            api_key=os.getenv("ARGILLA_API_KEY"),
         )
 
         # Convert into an Argilla dataset and push it to Argilla
         rg_dataset = dataset.to_argilla()
         rg_dataset.push_to_argilla(
             name=f"my-dataset-{uuid4()}",
-            workspace="<ARGILLA_WORKSPACE_NAME>",
+            workspace="admin",
         )
     except ImportError:
         pass