-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathopp_115.py
39 lines (29 loc) · 1017 Bytes
/
opp_115.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import datasets
import pandas as pd
def load_opp_115(directory: str) -> datasets.DatasetDict:
    """Load the OPP-115 CSV splits found in *directory* into a DatasetDict.

    For each of the ``train``, ``validation`` and ``test`` splits the file
    ``<directory>/<split>_dataset.csv`` is read as a header-less CSV whose two
    columns are named ``text`` and ``label``. Rows sharing the same ``text``
    are merged into a single example whose ``label`` is the list of distinct
    labels seen for that text.

    Args:
        directory: Path of the folder containing the three
            ``*_dataset.csv`` files.

    Returns:
        A ``datasets.DatasetDict`` keyed by split name; every split has a
        ``text`` column and a ``label`` column holding a list of labels.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when one of
            the expected split files is missing.
    """
    # define an empty DatasetDict
    combined = datasets.DatasetDict()
    # define available splits
    splits = ("train", "validation", "test")
    # loop over all splits
    for split in splits:
        # read CSV file corresponding to split
        temp_df = pd.read_csv(
            os.path.join(directory, f"{split}_dataset.csv"),
            header=None,
            names=["text", "label"],
        )
        # aggregate all labels per sentence into a unique list;
        # dict.fromkeys de-duplicates like set() but keeps first-occurrence
        # order, so the label order is reproducible across runs instead of
        # depending on per-process string hash randomization
        temp_df = (
            temp_df.groupby("text")
            .agg({"label": lambda labels: list(dict.fromkeys(labels))})
            .reset_index()
        )
        # convert temporary dataframe into HF dataset
        dataset = datasets.Dataset.from_pandas(temp_df, preserve_index=False)
        # insert dataset into combined DatasetDict
        combined[split] = dataset
    return combined