-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpolicy_detection.py
37 lines (27 loc) · 1.24 KB
/
policy_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import datasets
import os
def load_policy_detection(
    directory: str,
    *,
    filename: str = "1301_dataset.csv",
    test_size: float = 0.3,
    validation_size: float = 0.15,
    seed: int = 42,
) -> datasets.DatasetDict:
    """Load the policy-detection CSV and split it into train/validation/test.

    The CSV is read with its first column as the index; only the
    ``policy_text`` and ``is_policy`` columns are kept and are renamed to
    ``text`` and ``label``. Label values ``True``/``False`` are replaced by
    the strings ``"Policy"``/``"Not Policy"``; any other value is left
    unchanged.

    Args:
        directory: Directory that contains the CSV file.
        filename: Name of the CSV file inside ``directory``.
        test_size: Value forwarded as ``test_size`` to the first
            ``train_test_split`` call, which carves off the test split.
        validation_size: Value forwarded as ``test_size`` to the second
            ``train_test_split`` call. It is applied to the rows that remain
            after the test split, not to the full dataset (with the defaults
            this is roughly 0.15 * 0.7 = 10.5% of all rows).
        seed: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"validation"`` and
        ``"test"``.
    """
    csv_path = os.path.join(directory, filename)
    frame = pd.read_csv(csv_path, index_col=0)

    # Select, rename and relabel in one chain so that no assignment is ever
    # made into a column-sliced frame.
    frame = (
        frame[["policy_text", "is_policy"]]
        .rename(columns={"policy_text": "text", "is_policy": "label"})
        .assign(
            label=lambda d: d["label"].replace(
                {True: "Policy", False: "Not Policy"}
            )
        )
    )

    # preserve_index=False keeps the CSV index out of the dataset columns.
    dataset = datasets.Dataset.from_pandas(frame, preserve_index=False)

    # First split off the test set, then split the remaining training rows
    # again to obtain the validation set.
    outer_split = dataset.train_test_split(test_size=test_size, seed=seed)
    inner_split = outer_split["train"].train_test_split(
        test_size=validation_size, seed=seed
    )

    combined = datasets.DatasetDict()
    combined["train"] = inner_split["train"]
    combined["validation"] = inner_split["test"]
    combined["test"] = outer_split["test"]
    return combined