Merge pull request #79 from georgian-io/akash/better-preprocessing
Feat: Better Preprocessing
akashsara authored Sep 17, 2024
2 parents 90823e3 + 58e722b commit b2f05ee
Showing 16 changed files with 719 additions and 287 deletions.
4 changes: 4 additions & 0 deletions datasets/Melbourne_Airbnb_Open_Data/train_config.json
@@ -7,7 +7,11 @@
"model_name_or_path": "bert-base-multilingual-uncased",
"do_train": true,
"categorical_encode_type": "binary",
"categorical_handle_na": true,
"categorical_na_value": "-999999",
"numerical_transformer_method": "quantile_normal",
"numerical_handle_na": true,
"numerical_how_handle_na": "median",
"tokenizer_name": "bert-base-multilingual-uncased",
"use_simple_classifier": false,
"logging_dir": "./logs_airbnb/bertmultilingual_gating_on_cat_and_num_feats_then_sum_full_model_lr_3e-3/",
4 changes: 4 additions & 0 deletions datasets/PetFindermy_Adoption_Prediction/train_config.json
@@ -7,7 +7,11 @@
"model_name_or_path": "bert-base-multilingual-uncased",
"do_train": true,
"categorical_encode_type": "ohe",
"categorical_handle_na": true,
"categorical_na_value": "-999999",
"numerical_transformer_method": "quantile_normal",
"numerical_handle_na": true,
"numerical_how_handle_na": "median",
"tokenizer_name": "bert-base-multilingual-uncased",
"use_simple_classifier": false,
"logging_dir": "./logs_petfinder/bertmultilingual_gating_on_cat_and_num_feats_then_sum_full_model_lr_3e-3/",
4 changes: 4 additions & 0 deletions datasets/Womens_Clothing_E-Commerce_Reviews/train_config.json
@@ -7,7 +7,11 @@
"model_name_or_path": "bert-base-uncased",
"do_train": true,
"categorical_encode_type": "binary",
"categorical_handle_na": true,
"categorical_na_value": "-999999",
"numerical_transformer_method": "quantile_normal",
"numerical_handle_na": true,
"numerical_how_handle_na": "median",
"tokenizer_name": "bert-base-uncased",
"use_simple_classifier": false,
"logging_dir": "./logs_clothing_review/bertbase_gating_on_cat_and_num_feats_then_sum_full_model_lr_3e-3/",
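Taken together, the four keys added to each of these configs switch on missing-value handling before the categorical encoder and the numerical transformer are fit. A rough sketch of what the settings amount to in pandas terms, using a toy DataFrame with made-up column names (an illustration of the options, not the toolkit's actual preprocessing code):

```python
# Illustration only: what the new config keys imply for a toy DataFrame.
# Column names are invented; the toolkit's own preprocessing lives elsewhere.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "review_score": [4.5, np.nan, 3.0],           # numerical column with a NaN
        "neighbourhood": ["CBD", None, "Docklands"],  # categorical column with a missing value
    }
)

# categorical_handle_na=true, categorical_na_value="-999999":
# missing categories become an explicit sentinel category.
df["neighbourhood"] = df["neighbourhood"].fillna("-999999")

# numerical_handle_na=true, numerical_how_handle_na="median":
# missing numericals are imputed with the column median before the
# quantile_normal transform is fit.
df["review_score"] = df["review_score"].fillna(df["review_score"].median())

print(df)
```

The sentinel string and the imputation strategy map directly onto categorical_na_value and numerical_how_handle_na as set in the configs above.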
146 changes: 146 additions & 0 deletions examples/inference.py
@@ -0,0 +1,146 @@
"""
This is an example of how to use the toolkit to run inference.
We use saved feature processors (generated by code in the tests folder).
This code assumes you have a dataframe of datapoints and batches them up.
"""

import os
import sys

sys.path.append("./")

import joblib
import numpy as np
import pandas as pd
import torch
from transformers import AutoConfig, AutoTokenizer, HfArgumentParser, set_seed

from multimodal_exp_args import (
ModelArguments,
MultimodalDataTrainingArguments,
OurTrainingArguments,
)
from multimodal_transformers.data import load_data
from multimodal_transformers.model import AutoModelWithTabular, TabularConfig

if __name__ == "__main__":
DEBUG = True
DEBUG_DATASET_SIZE = 50
JSON_FILE = "./tests/test_airbnb.json"
MODEL_SAVE_DIR = "./logs_airbnb/bertmultilingual_gating_on_cat_and_num_feats_then_sum_full_model_lr_3e-3/"
NUMERICAL_TRANSFORMER_PATH = os.path.join(
MODEL_SAVE_DIR, "numerical_transformer.pkl"
)
CATEGORICAL_TRANSFORMER_PATH = os.path.join(
MODEL_SAVE_DIR, "categorical_transformer.pkl"
)
MODEL_CONFIG_PATH = os.path.join(MODEL_SAVE_DIR, "config.json")
MODEL_PATH = os.path.join(MODEL_SAVE_DIR, "model.safetensors")

# Parse our input json files
parser = HfArgumentParser(
(ModelArguments, MultimodalDataTrainingArguments, OurTrainingArguments)
)
model_args, data_args, training_args = parser.parse_json_file(
json_file=os.path.abspath(JSON_FILE)
)

# Set random seed for reproducibility
set_seed(training_args.seed)

# Create a tokenizer
tokenizer = AutoTokenizer.from_pretrained(
(
model_args.tokenizer_name
if model_args.tokenizer_name
else model_args.model_name_or_path
),
cache_dir=model_args.cache_dir,
)

# Load our feature processors
categorical_transformer = joblib.load(CATEGORICAL_TRANSFORMER_PATH)
numerical_transformer = joblib.load(NUMERICAL_TRANSFORMER_PATH)

# Load our test set
data_df = pd.read_csv(os.path.join(data_args.data_path, "test.csv"))

# Load and preprocess our test dataset
test_dataset = load_data(
data_df=data_df,
text_cols=data_args.column_info["text_cols"],
tokenizer=tokenizer,
label_col=data_args.column_info["label_col"],
label_list=data_args.column_info["label_list"],
categorical_cols=data_args.column_info["cat_cols"],
numerical_cols=data_args.column_info["num_cols"],
sep_text_token_str=(
tokenizer.sep_token
if not data_args.column_info["text_col_sep_token"]
else data_args.column_info["text_col_sep_token"]
),
categorical_transformer=categorical_transformer,
numerical_transformer=numerical_transformer,
max_token_length=training_args.max_token_length,
debug=DEBUG,
debug_dataset_size=DEBUG_DATASET_SIZE,
)

task = data_args.task
# Regression tasks have only one "label"
if task == "regression":
num_labels = 1
else:
num_labels = (
len(np.unique(test_dataset.labels))
if data_args.num_classes == -1
else data_args.num_classes
)

# Setup configs
config = AutoConfig.from_pretrained(
MODEL_CONFIG_PATH,
cache_dir=model_args.cache_dir,
)
tabular_config = TabularConfig(
num_labels=num_labels,
cat_feat_dim=(
test_dataset.cat_feats.shape[1] if test_dataset.cat_feats is not None else 0
),
numerical_feat_dim=(
test_dataset.numerical_feats.shape[1]
if test_dataset.numerical_feats is not None
else 0
),
**vars(data_args),
)
config.tabular_config = tabular_config

# Make model
model = AutoModelWithTabular.from_pretrained(
MODEL_PATH,
config=config,
cache_dir=model_args.cache_dir,
)

# Run inference
dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
for batch in dataloader:
_, logits, classifier_outputs = model(
batch["input_ids"],
attention_mask=batch["attention_mask"],
token_type_ids=batch["token_type_ids"],
cat_feats=batch["cat_feats"],
numerical_feats=batch["numerical_feats"],
)
all_labels.append(batch["labels"])
all_preds.append(logits.argmax(axis=1))

all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)
acc = torch.sum(all_preds == all_labels) / all_labels.shape[0]
print(f"Accuracy: {acc}")
76 changes: 52 additions & 24 deletions main.py
@@ -75,9 +75,11 @@ def main():
     )
 
     tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name
-        if model_args.tokenizer_name
-        else model_args.model_name_or_path,
+        (
+            model_args.tokenizer_name
+            if model_args.tokenizer_name
+            else model_args.model_name_or_path
+        ),
         cache_dir=model_args.cache_dir,
     )
 
@@ -91,13 +93,21 @@
             categorical_cols=data_args.column_info["cat_cols"],
             numerical_cols=data_args.column_info["num_cols"],
             categorical_encode_type=data_args.categorical_encode_type,
+            categorical_handle_na=data_args.categorical_handle_na,
+            categorical_na_value=data_args.categorical_na_value,
             numerical_transformer_method=data_args.numerical_transformer_method,
-            sep_text_token_str=tokenizer.sep_token
-            if not data_args.column_info["text_col_sep_token"]
-            else data_args.column_info["text_col_sep_token"],
+            numerical_handle_na=data_args.numerical_handle_na,
+            numerical_how_handle_na=data_args.numerical_how_handle_na,
+            numerical_na_value=data_args.numerical_na_value,
+            sep_text_token_str=(
+                tokenizer.sep_token
+                if not data_args.column_info["text_col_sep_token"]
+                else data_args.column_info["text_col_sep_token"]
+            ),
             max_token_length=training_args.max_token_length,
             debug=training_args.debug_dataset,
             debug_dataset_size=training_args.debug_dataset_size,
+            output_dir=training_args.output_dir,
         )
         train_datasets = [train_dataset]
         val_datasets = [val_dataset]
@@ -114,13 +124,21 @@
             categorical_cols=data_args.column_info["cat_cols"],
             numerical_cols=data_args.column_info["num_cols"],
             categorical_encode_type=data_args.categorical_encode_type,
+            categorical_handle_na=data_args.categorical_handle_na,
+            categorical_na_value=data_args.categorical_na_value,
             numerical_transformer_method=data_args.numerical_transformer_method,
-            sep_text_token_str=tokenizer.sep_token
-            if not data_args.column_info["text_col_sep_token"]
-            else data_args.column_info["text_col_sep_token"],
+            numerical_handle_na=data_args.numerical_handle_na,
+            numerical_how_handle_na=data_args.numerical_how_handle_na,
+            numerical_na_value=data_args.numerical_na_value,
+            sep_text_token_str=(
+                tokenizer.sep_token
+                if not data_args.column_info["text_col_sep_token"]
+                else data_args.column_info["text_col_sep_token"]
+            ),
             max_token_length=training_args.max_token_length,
             debug=training_args.debug_dataset,
             debug_dataset_size=training_args.debug_dataset_size,
+            output_dir=training_args.output_dir,
         )
         train_dataset = train_datasets[0]
 
@@ -163,27 +181,35 @@ def compute_metrics_fn(p: EvalPrediction):
     ):
         logger.info(f"======== Fold {i+1} ========")
         config = AutoConfig.from_pretrained(
-            model_args.config_name
-            if model_args.config_name
-            else model_args.model_name_or_path,
+            (
+                model_args.config_name
+                if model_args.config_name
+                else model_args.model_name_or_path
+            ),
             cache_dir=model_args.cache_dir,
         )
         tabular_config = TabularConfig(
             num_labels=num_labels,
-            cat_feat_dim=train_dataset.cat_feats.shape[1]
-            if train_dataset.cat_feats is not None
-            else 0,
-            numerical_feat_dim=train_dataset.numerical_feats.shape[1]
-            if train_dataset.numerical_feats is not None
-            else 0,
+            cat_feat_dim=(
+                train_dataset.cat_feats.shape[1]
+                if train_dataset.cat_feats is not None
+                else 0
+            ),
+            numerical_feat_dim=(
+                train_dataset.numerical_feats.shape[1]
+                if train_dataset.numerical_feats is not None
+                else 0
+            ),
             **vars(data_args),
         )
         config.tabular_config = tabular_config
 
         model = AutoModelWithTabular.from_pretrained(
-            model_args.config_name
-            if model_args.config_name
-            else model_args.model_name_or_path,
+            (
+                model_args.config_name
+                if model_args.config_name
+                else model_args.model_name_or_path
+            ),
             config=config,
             cache_dir=model_args.cache_dir,
         )
@@ -200,9 +226,11 @@ def compute_metrics_fn(p: EvalPrediction):
         )
         if training_args.do_train:
             trainer.train(
-                resume_from_checkpoint=model_args.model_name_or_path
-                if os.path.isdir(model_args.model_name_or_path)
-                else None
+                resume_from_checkpoint=(
+                    model_args.model_name_or_path
+                    if os.path.isdir(model_args.model_name_or_path)
+                    else None
+                )
             )
             trainer.save_model()
 
50 changes: 48 additions & 2 deletions multimodal_exp_args.py
@@ -114,13 +114,59 @@ class MultimodalDataTrainingArguments:
"choices": ["ohe", "binary", "label", "none"],
},
)

categorical_handle_na: bool = field(
default=False,
metadata={
"help": "Whether to handle NaN values for categorical columns.",
},
)

categorical_na_value: str = field(
default="-99999",
metadata={
"help": "Value to replace NaNs with in categorical columns when categorical_handle_na is set to True.",
},
)

ohe_handle_unknown: str = field(
default="error",
metadata={
"help": "How a one hot encoder (if used) should handle new unknown classes. Refer: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html",
"choices": ["error", "ignore", "infrequent_if_exist"],
},
)

numerical_transformer_method: str = field(
default="yeo_johnson",
metadata={
"help": "sklearn numerical transformer to preprocess numerical data",
"choices": ["yeo_johnson", "box_cox", "quantile_normal", "none"],
},
)

numerical_handle_na: bool = field(
default=False,
metadata={
"help": "Whether to handle NaN values for numerical columns.",
},
)

numerical_how_handle_na: str = field(
default="median",
metadata={
"help": "How to handle NaN values in numerical columns. Mean/Median replaces NaNs with the mean/median of the column. Value replaces NaNs with numerical_na_value.",
"choices": ["median", "mean", "value"],
},
)

numerical_na_value: float = field(
default=0.0,
metadata={
"help": "Value to replace NaNs with in numerical columns when numerical_handle_na is set to True.",
},
)

task: str = field(
default="classification",
metadata={
@@ -185,10 +231,10 @@ def __post_init__(self):
                 self.column_info = json.load(f)
         assert "text_cols" in self.column_info and "label_col" in self.column_info
         if "cat_cols" not in self.column_info:
-            self.column_info["cat_cols"] = None
+            self.column_info["cat_cols"] = []
             self.categorical_encode_type = "none"
         if "num_cols" not in self.column_info:
-            self.column_info["num_cols"] = None
+            self.column_info["num_cols"] = []
             self.numerical_transformer_method = "none"
         if "text_col_sep_token" not in self.column_info:
             self.column_info["text_col_sep_token"] = None