diff --git a/CHANGELOG.md b/CHANGELOG.md index fc6d23ca4..61f32b16a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added downstream eval task for requests dumped from oe-eval tasks - Added `CosLinearEnvelope` scheduler, which is a pointwise product of a cosine schedule and a linear decay. - Added ability to save outputs of submodules for debugging purposes. +- Added a number of tasks from oe-eval to the downstream eval tasks. - Version dolma flan change in named_data_mix.py ### Changed diff --git a/olmo/eval/downstream.py b/olmo/eval/downstream.py index 88a5ec2e2..53fddc8b0 100644 --- a/olmo/eval/downstream.py +++ b/olmo/eval/downstream.py @@ -1534,9 +1534,14 @@ def prep_examples(self): continuation_str = request_dict["continuation"] label_id = request["label"] cont_id = request["idx"] - if self.metric_type in ["ce_loss", "bpb"] and label_id != cont_id: - # Skip non-target continuations for ce_loss and bpb - continue + if self.metric_type in ["ce_loss", "bpb"]: + if label_id != cont_id: + # Skip non-target continuations for ce_loss and bpb + continue + else: + # Treat as instance with just one continuation + cont_id = 0 + label_id = 0 doc_text = request_dict["context"] ctx = self.token_encode(doc_text) dc = self.token_encode(self.doc_to_domain_conditional(doc)) @@ -1746,6 +1751,8 @@ def doc_to_label(self, doc) -> int: ), "csqa_mc_5shot": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "mc_5shot", "metric_type": "acc"}), "csqa_mc_5shot_bpb": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "mc_5shot", "metric_type": "bpb"}), + "csqa_rc_0shot": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_0shot", "metric_type": "len_norm"}), + "csqa_rc_0shot_bpb": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_0shot", "metric_type": "bpb"}), "csqa_rc_5shot": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_5shot", "metric_type": "len_norm"}), "csqa_rc_5shot_bpb": (OEEvalTask, {"dataset_path": "csqa", "dataset_name": "rc_5shot", "metric_type": "bpb"}), "hellaswag_mc_5shot": ( @@ -1812,6 +1819,14 @@ def doc_to_label(self, doc) -> int: OEEvalTask, {"dataset_path": "socialiqa", "dataset_name": "mc_5shot", "metric_type": "bpb"}, ), + "socialiqa_rc_0shot": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "rc_0shot", "metric_type": "len_norm"}, + ), + "socialiqa_rc_0shot_bpb": ( + OEEvalTask, + {"dataset_path": "socialiqa", "dataset_name": "rc_0shot", "metric_type": "bpb"}, + ), "socialiqa_rc_5shot": ( OEEvalTask, {"dataset_path": "socialiqa", "dataset_name": "rc_5shot", "metric_type": "len_norm"}, diff --git a/olmo_data/oe_eval_tasks/csqa/rc_0shot/config.json b/olmo_data/oe_eval_tasks/csqa/rc_0shot/config.json new file mode 100644 index 000000000..c3fc3e967 --- /dev/null +++ b/olmo_data/oe_eval_tasks/csqa/rc_0shot/config.json @@ -0,0 +1 @@ +{"task_name": "csqa", "task_hash": "739e57911dbac4fcdf9e5ff392d1b61e", "task_config": {"dataset_path": "commonsense_qa", "native_id_field": "id", "primary_metric": "acc_per_char", "split": "validation", "random_subsample_seed": 1234, "num_shots": 0, "generation_kwargs": {}, "context_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "fewshot_seed": 1234, "dataset_name": null, "task_name": "csqa", "version": 0, "task_core": "csqa"}, "current_date": "2024-08-23 21:20:34 UTC", "num_instances": 1221} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/csqa/rc_0shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/csqa/rc_0shot/requests.jsonl.gz new file mode 100644 index 000000000..6ea93a52d Binary files /dev/null and b/olmo_data/oe_eval_tasks/csqa/rc_0shot/requests.jsonl.gz differ diff --git a/olmo_data/oe_eval_tasks/socialiqa/rc_0shot/config.json b/olmo_data/oe_eval_tasks/socialiqa/rc_0shot/config.json new file mode 100644 index 000000000..bbd1b9217 --- /dev/null +++ b/olmo_data/oe_eval_tasks/socialiqa/rc_0shot/config.json @@ -0,0 +1 @@ +{"task_name": "socialiqa", "task_hash": "0880e16cf3101aff83d7fee648402df9", "task_config": {"dataset_path": "social_i_qa", "native_id_field": "index", "primary_metric": "acc_per_char", "split": "validation", "random_subsample_seed": 1234, "num_shots": 0, "generation_kwargs": {}, "context_kwargs": {}, "metric_kwargs": {"uncond_docid_offset": 1000000}, "fewshot_seed": 1234, "dataset_name": null, "task_name": "socialiqa", "version": 0, "task_core": "socialiqa"}, "current_date": "2024-08-23 21:20:37 UTC", "num_instances": 1954} \ No newline at end of file diff --git a/olmo_data/oe_eval_tasks/socialiqa/rc_0shot/requests.jsonl.gz b/olmo_data/oe_eval_tasks/socialiqa/rc_0shot/requests.jsonl.gz new file mode 100644 index 000000000..14355b4cc Binary files /dev/null and b/olmo_data/oe_eval_tasks/socialiqa/rc_0shot/requests.jsonl.gz differ