LAION-AI · andreaskoepf · Apr 21, 2023 · Apr 20, 2023 · Apr 21, 2023 · Apr 21, 2023
@@ -162,9 +162,9 @@ def get_one_dataset(
     elif dataset_name == "hellaswag":
         train, eval = load_hellaswag()
     elif dataset_name == "dolly15k":
-        dataset = DatabricksDolly15k(cache_dir=data_path)
+        dataset = DatabricksDolly15k(cache_dir=data_path, mode=mode, **kwargs)
     elif dataset_name == "alpaca_gpt4":
-        dataset = AlpacaGpt4(cache_dir=data_path, **kwargs)
+        dataset = AlpacaGpt4(cache_dir=data_path, mode=mode, **kwargs)
     else:
         raise ValueError(f"Unknown dataset {dataset_name}")
 

@@ -18,6 +18,7 @@
     "zhihu-kol": "wangrui6/zhihu-kol",
     "minimath": "kentsui/minimath",
     "oa_wiki_qa_bart_10000row": "michaelthwan/oa_wiki_qa_bart_10000row",
+    "poem_instructions": "checkai/instruction-poems",
 }
 
 

@@ -21,6 +21,8 @@
 # @agoryuno contributed this
 re_reference_remove = re.compile(r"\[\d+(?:,\s*\d+)*?\]")
 re_single_reference_remove = re.compile(r"\[\s?\d+\s?\]")
+
+# Matches a string that is empty or consists solely of whitespace characters (spaces, tabs, newlines).
 re_whitespace_newline_match = re.compile(r"^[\s\n]*$")
 
 
@@ -450,6 +452,7 @@ def process_split(
         dataset: Subset, reverse_augmentation: bool = False, keep_unreversed: bool = True
     ) -> list[tuple[str, str]]:
         data = []
+
         for row in dataset:
             question = row["instruction"]
             if len(row["input"]) > 0: