georgian-io · akashsaravanan-georgian · Mar 10, 2023 · Feb 16, 2022 · Mar 1, 2023 · Mar 1, 2023
diff --git a/datasets/Melbourne_Airbnb_Open_Data/train_config.json b/datasets/Melbourne_Airbnb_Open_Data/train_config.json
@@ -14,8 +14,8 @@
   "num_train_epochs": 5,
   "overwrite_output_dir": true,
   "learning_rate": 3e-3,
-  "per_device_train_batch_size": 12,
-  "per_device_eval_batch_size": 12,
+  "per_device_train_batch_size": 16,
+  "per_device_eval_batch_size": 16,
   "logging_steps": 50,
   "eval_steps": 500,
   "save_steps": 3000,

diff --git a/datasets/PetFindermy_Adoption_Prediction/train_config.json b/datasets/PetFindermy_Adoption_Prediction/train_config.json
@@ -1,24 +1,25 @@
 {
-  "output_dir": "./logs_petfinder/",
+  "output_dir": "./logs_petfinder/gating_on_cat_and_num_feats_then_sum_full_model",
   "debug_dataset": false,
   "task": "classification",
-  "num_labels": 5,
   "combine_feat_method": "text_only",
   "experiment_name": "bert-base-multilingual-uncased",
   "model_name_or_path": "bert-base-multilingual-uncased",
   "do_train": true,
+  "categorical_encode_type": "ohe",
+  "numerical_transformer_method": "quantile_normal",
   "tokenizer_name": "bert-base-multilingual-uncased",
-  "per_device_train_batch_size": 12,
-  "gpu_num": 0,
+  "use_simple_classifier": false,
+  "logging_dir": "./logs_petfinder/bertmultilingual_gating_on_cat_and_num_feats_then_sum_full_model_lr_3e-3/",
   "num_train_epochs": 5,
-  "categorical_encode_type": "ohe",
-  "use_class_weights": false,
+  "overwrite_output_dir": true,
+  "learning_rate": 1e-4,
+  "per_device_train_batch_size": 16,
+  "per_device_eval_batch_size": 16,
   "logging_steps": 50,
   "eval_steps": 750,
   "save_steps": 3000,
-  "learning_rate": 1e-4,
-  "data_path": "./datasets/PetFindermy_Adoption_Prediction/",
-  "column_info_path":  "./datasets/PetFindermy_Adoption_Prediction/column_info_all_text.json",
-  "overwrite_output_dir": true
+  "data_path": "./datasets/PetFindermy_Adoption_Prediction",
+  "column_info_path":  "./datasets/PetFindermy_Adoption_Prediction/column_info_all_text.json"
 }
 
diff --git a/datasets/Womens_Clothing_E-Commerce_Reviews/train_config.json b/datasets/Womens_Clothing_E-Commerce_Reviews/train_config.json
@@ -5,15 +5,21 @@
   "combine_feat_method": "text_only",
   "experiment_name": "Unimodal Bert Base Uncased",
   "model_name_or_path": "bert-base-uncased",
-  "gpu_num": 0,
   "do_train": true,
+  "categorical_encode_type": "binary",
+  "numerical_transformer_method": "quantile_normal",
   "tokenizer_name": "bert-base-uncased",
-  "per_device_train_batch_size": 12,
+  "use_simple_classifier": false,
+  "logging_dir": "./logs_clothing_review/bertbase_gating_on_cat_and_num_feats_then_sum_full_model_lr_3e-3/",
+  "num_train_epochs": 5,
+  "overwrite_output_dir": true,
+  "learning_rate": 3e-3,
+  "per_device_train_batch_size": 16,
+  "per_device_eval_batch_size": 16,
   "logging_steps": 50,
   "eval_steps": 750,
   "save_steps": 3000,
   "data_path": "./datasets/Womens_Clothing_E-Commerce_Reviews",
-  "column_info_path":  "./datasets/Womens_Clothing_E-Commerce_Reviews/column_info_all_text.json",
-  "overwrite_output_dir": true
+  "column_info_path":  "./datasets/Womens_Clothing_E-Commerce_Reviews/column_info_all_text.json"
 }
 
diff --git a/docs/source/notes/introduction.rst b/docs/source/notes/introduction.rst
@@ -95,7 +95,7 @@ The following example shows a forward pass on two data examples
     labels = torch.tensor([1, 0])
 
     model_inputs['cat_feats'] = categorical_feat
-    model_inputs['num_feats'] = numerical_feat
+    model_inputs['numerical_feats'] = numerical_feat
     model_inputs['labels'] = labels
 
     loss, logits, layer_outs = model(**model_inputs)

diff --git a/main.py b/main.py
@@ -116,16 +116,19 @@ def main():
 
     def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
         def compute_metrics_fn(p: EvalPrediction):
+            # p.predictions is now a list of objects
+            # The first entry is the actual predictions
+            predictions = p.predictions[0]
             if task_name == "classification":
-                preds_labels = np.argmax(p.predictions, axis=1)
-                if p.predictions.shape[-1] == 2:
-                    pred_scores = softmax(p.predictions, axis=1)[:, 1]
+                preds_labels = np.argmax(predictions, axis=1)
+                if predictions.shape[-1] == 2:
+                    pred_scores = softmax(predictions, axis=1)[:, 1]
                 else:
-                    pred_scores = softmax(p.predictions, axis=1)
+                    pred_scores = softmax(predictions, axis=1)
                 return calc_classification_metrics(pred_scores, preds_labels,
                                                    p.label_ids)
             elif task_name == "regression":
-                preds = np.squeeze(p.predictions)
+                preds = np.squeeze(predictions)
                 return calc_regression_metrics(preds, p.label_ids)
             else:
                 return {}
@@ -178,7 +181,7 @@ def compute_metrics_fn(p: EvalPrediction):
             output_eval_file = os.path.join(
                 training_args.output_dir, f"eval_metric_results_{task}_fold_{i+1}.txt"
             )
-            if trainer.is_world_master():
+            if trainer.is_world_process_zero():
                 with open(output_eval_file, "w") as writer:
                     logger.info("***** Eval results {} *****".format(task))
                     for key, value in eval_result.items():
@@ -190,13 +193,13 @@ def compute_metrics_fn(p: EvalPrediction):
         if training_args.do_predict:
             logging.info("*** Test ***")
 
-            predictions = trainer.predict(test_dataset=test_dataset).predictions
+            predictions = trainer.predict(test_dataset=test_dataset).predictions[0]
             output_test_file = os.path.join(
                 training_args.output_dir, f"test_results_{task}_fold_{i+1}.txt"
             )
             eval_result = trainer.evaluate(eval_dataset=test_dataset)
             logger.info(pformat(eval_result, indent=4))
-            if trainer.is_world_master():
+            if trainer.is_world_process_zero():
                 with open(output_test_file, "w") as writer:
                     logger.info("***** Test results {} *****".format(task))
                     writer.write("index\tprediction\n")

diff --git a/multimodal_exp_args.py b/multimodal_exp_args.py
@@ -1,10 +1,10 @@
 from dataclasses import dataclass, field
 import json
 import logging
-from typing import Optional, Tuple
+from typing import Optional, Tuple, List
 
 import torch
-from transformers.training_args import TrainingArguments, torch_required, cached_property
+from transformers.training_args import TrainingArguments, requires_backends, cached_property
 
 
 logger = logging.getLogger(__name__)
@@ -178,6 +178,10 @@ class OurTrainingArguments(TrainingArguments):
 
     learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
 
+    report_to: Optional[List[str]] = field(
+        default_factory=list, metadata={"help": "The list of integrations to report the results and logs to."}
+    )
+
     def __post_init__(self):
         if self.debug_dataset:
             self.max_token_length = 16
@@ -186,12 +190,12 @@ def __post_init__(self):
 
 
     @cached_property
-    @torch_required
     def _setup_devices(self) -> Tuple["torch.device", int]:
+        requires_backends(self, ["torch"])
         logger.info("PyTorch: setting up devices")
         if self.no_cuda:
             device = torch.device("cpu")
-            n_gpu = 0
+            self._n_gpu = 0
         elif self.local_rank == -1:
             # if n_gpu is > 1 we'll use nn.DataParallel.
             # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
@@ -200,15 +204,16 @@ def _setup_devices(self) -> Tuple["torch.device", int]:
             # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
             # will use the first GPU in that env, i.e. GPU#1
             device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-            n_gpu = torch.cuda.device_count()
+            self._n_gpu = torch.cuda.device_count()
         else:
             # Here, we'll use torch.distributed.
             # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-            torch.distributed.init_process_group(backend="nccl")
+            if not torch.distributed.is_initialized():
+                torch.distributed.init_process_group(backend="nccl", timeout=self.ddp_timeout_delta)
             device = torch.device("cuda", self.local_rank)
-            n_gpu = 1
+            self._n_gpu = 1
 
         if device.type == "cuda":
             torch.cuda.set_device(device)
 
-        return device, n_gpu
+        return device
diff --git a/multimodal_transformers/__init__.py b/multimodal_transformers/__init__.py
@@ -1,6 +1,6 @@
 import multimodal_transformers.data
 import multimodal_transformers.model
 
-__version__ = '0.1.2-alpha'
+__version__ = '0.2-alpha'
 
 __all__ = ['multimodal_transformers', '__version__']
diff --git a/multimodal_transformers/data/data_utils.py b/multimodal_transformers/data/data_utils.py
@@ -66,7 +66,7 @@ def change_name_func(x):
     def _one_hot(self):
         ohe = preprocessing.OneHotEncoder(sparse=False)
         ohe.fit(self.df[self.cat_feats].values)
-        self.feat_names = list(ohe.get_feature_names(self.cat_feats))
+        self.feat_names = list(ohe.get_feature_names_out(self.cat_feats))
         return ohe.transform(self.df[self.cat_feats].values)
 
     def fit_transform(self):

diff --git a/multimodal_transformers/data/load_data.py b/multimodal_transformers/data/load_data.py
@@ -370,7 +370,7 @@ def load_data(data_df,
         :obj:`tabular_torch_dataset.TorchTextDataset`: The converted dataset
     """
     if debug:
-        data_df = data_df[:500]
+        data_df = data_df[:100]
     if empty_text_values is None:
         empty_text_values = ['nan', 'None']
 

diff --git a/multimodal_transformers/model/tabular_combiner.py b/multimodal_transformers/model/tabular_combiner.py
@@ -265,7 +265,7 @@ def __init__(self, tabular_config):
                         self.numerical_feat_dim,
                         division=self.mlp_division,
                         output_dim=output_dim_num)
-                    self.cat_mlp = MLP(
+                    self.num_mlp = MLP(
                         self.numerical_feat_dim,
                         output_dim_num,
                         num_hidden_lyr=len(dims),
@@ -406,7 +406,7 @@ def forward(self, text_feats, cat_feats=None, numerical_feats=None):
                 if self.numerical_feat_dim > self.text_out_dim:
                     numerical_feats = self.num_mlp(numerical_feats)
                 w_num = torch.mm(numerical_feats, self.weight_num)
-                g_num = (torch.cat([w_text, w_cat], dim=-1) * self.weight_a).sum(dim=1).unsqueeze(0).T
+                g_num = (torch.cat([w_text, w_num], dim=-1) * self.weight_a).sum(dim=1).unsqueeze(0).T
             else:
                 w_num = None
                 g_num = torch.zeros(0, device=g_text.device)

diff --git a/multimodal_transformers/model/tabular_modeling_auto.py b/multimodal_transformers/model/tabular_modeling_auto.py
@@ -1,7 +1,7 @@
 from collections import OrderedDict
 
 from transformers.configuration_utils import PretrainedConfig
-from transformers.configuration_auto import (
+from transformers import (
     AutoConfig,
     AlbertConfig,
     BertConfig,

diff --git a/multimodal_transformers/model/tabular_transformers.py b/multimodal_transformers/model/tabular_transformers.py
@@ -7,14 +7,14 @@
     XLNetForSequenceClassification,
     XLMForSequenceClassification
 )
-from transformers.modeling_bert import BERT_INPUTS_DOCSTRING
-from transformers.modeling_roberta import ROBERTA_INPUTS_DOCSTRING
-from transformers.modeling_distilbert import DISTILBERT_INPUTS_DOCSTRING
-from transformers.modeling_albert import ALBERT_INPUTS_DOCSTRING
-from transformers.modeling_xlnet import XLNET_INPUTS_DOCSTRING
-from transformers.modeling_xlm import XLM_INPUTS_DOCSTRING
-from transformers.configuration_xlm_roberta import XLMRobertaConfig
-from transformers.file_utils import add_start_docstrings_to_callable
+from transformers.models.bert.modeling_bert import BERT_INPUTS_DOCSTRING
+from transformers.models.roberta.modeling_roberta import ROBERTA_INPUTS_DOCSTRING
+from transformers.models.distilbert.modeling_distilbert import DISTILBERT_INPUTS_DOCSTRING
+from transformers.models.albert.modeling_albert import ALBERT_INPUTS_DOCSTRING
+from transformers.models.xlnet.modeling_xlnet import XLNET_INPUTS_DOCSTRING
+from transformers.models.xlm.modeling_xlm import XLM_INPUTS_DOCSTRING
+from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
+from transformers.file_utils import add_start_docstrings_to_model_forward
 
 from .tabular_combiner import TabularFeatCombiner
 from .tabular_config import TabularConfig
@@ -61,7 +61,7 @@ def __init__(self, hf_model_config):
                                           hidden_channels=dims,
                                           bn=True)
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
         input_ids=None,
@@ -162,7 +162,7 @@ def __init__(self, hf_model_config):
                                           hidden_channels=dims,
                                           bn=True)
 
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
         input_ids=None,
@@ -274,7 +274,7 @@ def __init__(self, hf_model_config):
                                           hidden_channels=dims,
                                           bn=True)
 
-    @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_start_docstrings_to_model_forward(DISTILBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
         input_ids=None,
@@ -375,7 +375,7 @@ def __init__(self, hf_model_config):
                                           hidden_channels=dims,
                                           bn=True)
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -440,6 +440,9 @@ class XLNetWithTabular(XLNetForSequenceClassification):
             :obj:`TabularConfig` instance specifying the configs for :obj:`TabularFeatCombiner`
     """
     def __init__(self, hf_model_config):
+        # When set to true, the sequence summary layer maps hidden_size -> num_labels.
+        # We need its output to stay hidden_size -> hidden_size, so disable it.
+        hf_model_config.summary_proj_to_labels = False
         super().__init__(hf_model_config)
         tabular_config = hf_model_config.tabular_config
         if type(tabular_config) is dict:  # when loading from saved model
@@ -465,7 +468,7 @@ def __init__(self, hf_model_config):
                                           hidden_channels=dims,
                                           bn=True)
 
-    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_start_docstrings_to_model_forward(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def forward(
         self,
         input_ids=None,
@@ -494,7 +497,7 @@ def forward(
             If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
+        use_cache = self.training or (use_cache if use_cache is not None else False)
 
         transformer_outputs = self.transformer(
             input_ids,
@@ -506,7 +509,7 @@ def forward(
             input_mask=input_mask,
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
+            use_mems=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
@@ -538,6 +541,9 @@ class XLMWithTabular(XLMForSequenceClassification):
             :obj:`TabularConfig` instance specifying the configs for :obj:`TabularFeatCombiner`
     """
     def __init__(self, hf_model_config):
+        # When set to true, the sequence summary layer maps hidden_size -> num_labels.
+        # We need its output to stay hidden_size -> hidden_size, so disable it.
+        hf_model_config.summary_proj_to_labels = False
         super().__init__(hf_model_config)
         tabular_config = hf_model_config.tabular_config
         if type(tabular_config) is dict:  # when loading from saved model
@@ -563,7 +569,7 @@ def __init__(self, hf_model_config):
                                           hidden_channels=dims,
                                           bn=True)
 
-    @ add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @ add_start_docstrings_to_model_forward(XLM_INPUTS_DOCSTRING)
     def forward(
             self,
             input_ids=None,
@@ -617,4 +623,4 @@ def forward(
                                                               labels,
                                                               self.num_labels,
                                                               class_weights)
-        return loss, logits, classifier_layer_outputs
+        return loss, logits, classifier_layer_outputs
diff --git a/setup.py b/setup.py
@@ -1,17 +1,19 @@
 from setuptools import setup, find_packages
 
-__version__ = '0.1.4-alpha'
+__version__ = '0.2-alpha'
 url = 'https://github.com/georgianpartners/Multimodal-Toolkit'
 
 install_requires = [
     'torch',
-    'transformers==3.1',
+    'transformers>=4.26.1',
     'numpy',
     'tqdm',
     'scipy',
     'networkx',
     'scikit-learn',
     'pandas',
+    'sacremoses',
+    'pytest'
 ]
 
 setup(
@@ -36,4 +38,4 @@
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
   ],
-)
+)