Merge pull request #196 from kaushaltrivedi/auto-model
Auto model
kaushaltrivedi authored Apr 14, 2020
2 parents d0bd75e + f72fc85 commit 53773ab
Showing 9 changed files with 38 additions and 183 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -1,6 +1,6 @@
{
"python.formatting.provider": "black",
"python.pythonPath": "/Users/kaushaltrivedi/anaconda3/envs/transformer/bin/python",
"python.pythonPath": "/home/ubuntu/anaconda3/bin/python",
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": true,
"python.linting.enabled": true
3 changes: 2 additions & 1 deletion container/Dockerfile_gpu
@@ -65,7 +65,8 @@ RUN pip --no-cache-dir install \

RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext

RUN pip --no-cache-dir install fast-bert
# RUN pip --no-cache-dir install fast-bert
RUN pip install fast-bert

ENV PATH="/opt/ml/code:${PATH}"
COPY /bert /opt/ml/code
9 changes: 2 additions & 7 deletions container/bert/predictor.py
@@ -24,11 +24,6 @@
# PATH = Path(os.path.join(prefix, "model"))
PATH = os.path.join(prefix, "model")

PRETRAINED_PATH = Path(os.path.join(prefix, "code"))

BERT_PRETRAINED_PATH = (
PRETRAINED_PATH / "pretrained-weights" / "uncased_L-12_H-768_A-12/"
)
MODEL_PATH = os.path.join(PATH, "pytorch_model.bin")

# request_text = None
@@ -42,7 +37,7 @@ def get_predictor_model(cls):

# print(cls.searching_all_files(PATH))
# Get model predictor
if cls.model == None:
if cls.model is None:
with open(os.path.join(PATH, "model_config.json")) as f:
model_config = json.load(f)

@@ -128,7 +123,7 @@ def transformation():
text = data["text"]
try:
bing_key = data["bing_key"]
except:
except Exception:
bing_key = None

else:
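Both predictor fixes are idiom-level rather than behavioural. A minimal sketch of the resulting patterns, with hypothetical names standing in for the real predictor module:

```python
import json
import os

MODEL_DIR = "/opt/ml/model"  # hypothetical; stands in for `prefix/model` above


class ScoringService:
    model = None  # loaded lazily on the first request

    @classmethod
    def get_predictor_model(cls):
        # `is None` is the PEP 8 identity check that replaces `== None`
        if cls.model is None:
            with open(os.path.join(MODEL_DIR, "model_config.json")) as f:
                cls.model = json.load(f)  # stand-in for the real model load
        return cls.model


def read_bing_key(data):
    # `except Exception` no longer swallows KeyboardInterrupt or SystemExit
    # the way the old bare `except:` did
    try:
        return data["bing_key"]
    except Exception:
        return None
```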
37 changes: 7 additions & 30 deletions container/bert/train
@@ -12,12 +12,7 @@ import logging

import torch

from transformers import (
BertTokenizer,
XLNetTokenizer,
RobertaTokenizer,
DistilBertTokenizer,
)
from transformers import AutoTokenizer


from fast_bert.data_cls import BertDataBunch
@@ -124,27 +119,10 @@ def train():
)
finetuned_model = None

# Tokenizer
if training_config["model_type"] == "bert":
tokenizer = BertTokenizer.from_pretrained(
str(PRETRAINED_PATH),
do_lower_case=bool(training_config["do_lower_case"]),
)
elif training_config["model_type"] == "distilbert":
tokenizer = DistilBertTokenizer.from_pretrained(
str(PRETRAINED_PATH),
do_lower_case=bool(training_config["do_lower_case"]),
)
elif training_config["model_type"] == "roberta":
tokenizer = RobertaTokenizer.from_pretrained(
str(PRETRAINED_PATH),
do_lower_case=bool(training_config["do_lower_case"]),
)
elif training_config["model_type"] == "xlnet":
tokenizer = XLNetTokenizer.from_pretrained(
str(PRETRAINED_PATH),
do_lower_case=bool(training_config["do_lower_case"]),
)
# use auto-tokenizer
tokenizer = AutoTokenizer.from_pretrained(str(PRETRAINED_PATH), use_fast=True)

device = torch.device("cuda")
if torch.cuda.device_count() > 1:
@@ -154,7 +132,7 @@

logger.info("Number of GPUs: {}".format(torch.cuda.device_count()))

if bool(training_config["multi_label"]) == True:
if bool(training_config["multi_label"]) is True:
label_col = json.loads(training_config["label_col"])
else:
label_col = training_config["label_col"]
@@ -180,7 +158,7 @@
)

metrics = []
if bool(training_config["multi_label"]) == False:
if bool(training_config["multi_label"]) is False:
metrics.append({"name": "accuracy", "function": accuracy})
else:
metrics.append({"name": "accuracy_thresh", "function": accuracy_thresh})
@@ -248,4 +226,3 @@ if __name__ == "__main__":

# A zero exit code causes the job to be marked as Succeeded.
sys.exit(0)
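
The four per-model tokenizer branches removed above collapse into a single AutoTokenizer call: the concrete tokenizer class is inferred from the files at the pretrained path, so new model types need no extra branch. A minimal sketch of the call in isolation, using a public model id in place of PRETRAINED_PATH:

```python
from transformers import AutoTokenizer

# AutoTokenizer reads the checkpoint's config/vocab files and returns the
# matching concrete class (BertTokenizer, RobertaTokenizer, ...);
# use_fast=True asks for the Rust-backed fast tokenizer where available.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

print(tokenizer.tokenize("Fast-Bert now resolves tokenizers automatically."))
```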

52 changes: 4 additions & 48 deletions fast_bert/data_cls.py
@@ -10,48 +10,7 @@
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from transformers import (
WEIGHTS_NAME,
BertConfig,
BertForSequenceClassification,
BertTokenizer,
XLMConfig,
XLMForSequenceClassification,
XLMTokenizer,
XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer,
RobertaConfig,
RobertaForSequenceClassification,
RobertaTokenizer,
CamembertConfig,
CamembertForSequenceClassification,
CamembertTokenizer,
AlbertConfig,
AlbertForSequenceClassification,
AlbertTokenizer,
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
)

MODEL_CLASSES = {
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
"xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
"xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
"roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
"albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
"distilbert": (
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
),
"camembert-base": (
CamembertConfig,
CamembertForSequenceClassification,
CamembertTokenizer,
),
}
from transformers import AutoTokenizer


class InputExample(object):
@@ -404,11 +363,8 @@ def __init__(
label_dir = Path(label_dir)

if isinstance(tokenizer, str):
_, _, tokenizer_class = MODEL_CLASSES[model_type]
# instantiate the new tokeniser object using the tokeniser name
tokenizer = tokenizer_class.from_pretrained(
tokenizer, do_lower_case=("uncased" in tokenizer)
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)

self.tokenizer = tokenizer
self.data_dir = data_dir
@@ -554,8 +510,8 @@ def get_dataset_from_examples(
file_name = self.val_file
elif set_type == "test":
file_name = (
"test"
) # test is not supposed to be a file - just a list of texts
"test" # test is not supposed to be a file - just a list of texts
)

cached_features_file = os.path.join(
self.cache_dir,
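With MODEL_CLASSES gone from the data module, a tokenizer passed as a string is resolved through the same AutoTokenizer path. A sketch of how a databunch might now be built, with illustrative directories, files, and column names:

```python
from fast_bert.data_cls import BertDataBunch

# Passing a string for `tokenizer` now triggers
# AutoTokenizer.from_pretrained(name, use_fast=True) inside BertDataBunch.
# Directory and file names below are illustrative.
databunch = BertDataBunch(
    "./data",                      # data_dir holding the train/val CSVs
    "./labels",                    # label_dir holding labels.csv
    tokenizer="bert-base-uncased", # string resolved via AutoTokenizer
    train_file="train.csv",
    val_file="val.csv",
    text_col="text",
    label_col="label",
    batch_size_per_gpu=16,
    max_seq_length=128,
    multi_label=False,
    model_type="bert",
)
```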
21 changes: 12 additions & 9 deletions fast_bert/learner_cls.py
@@ -51,6 +51,8 @@
DistilBertTokenizer,
)

from transformers import AutoModelForSequenceClassification, AutoConfig


MODEL_CLASSES = {
"bert": (
@@ -130,12 +132,6 @@ def from_pretrained_model(

model_type = dataBunch.model_type

config_class, model_class, _ = MODEL_CLASSES[model_type]

config = config_class.from_pretrained(
str(pretrained_path), num_labels=len(dataBunch.labels)
)

if torch.cuda.is_available():
map_location = lambda storage, loc: storage.cuda()
else:
@@ -147,13 +143,20 @@
model_state_dict = None

if multi_label is True:
print(str(pretrained_path))
print(type(str(pretrained_path)))
config_class, model_class, _ = MODEL_CLASSES[model_type]

config = config_class.from_pretrained(
str(pretrained_path), num_labels=len(dataBunch.labels)
)

model = model_class[1].from_pretrained(
str(pretrained_path), config=config, state_dict=model_state_dict
)
else:
model = model_class[0].from_pretrained(
config = AutoConfig.from_pretrained(
str(pretrained_path), num_labels=len(dataBunch.labels)
)
model = AutoModelForSequenceClassification.from_pretrained(
str(pretrained_path), config=config, state_dict=model_state_dict
)

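The single-label branch now defers architecture selection to the Auto classes, while multi-label keeps the explicit MODEL_CLASSES table because its custom heads live in fast_bert.modeling rather than in transformers. A minimal sketch of the new single-label path, with an illustrative model id and label count:

```python
from transformers import AutoConfig, AutoModelForSequenceClassification

# num_labels flows through the config into the classification head; the
# concrete model class is picked from the checkpoint's config.json.
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=4)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", config=config
)
```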
93 changes: 8 additions & 85 deletions fast_bert/prediction.py
@@ -1,91 +1,15 @@
import os
import torch
from transformers import BertTokenizer
from .data_cls import BertDataBunch
from .learner_cls import BertLearner
from .modeling import (
BertForMultiLabelSequenceClassification,
XLNetForMultiLabelSequenceClassification,
RobertaForMultiLabelSequenceClassification,
DistilBertForMultiLabelSequenceClassification,
CamembertForMultiLabelSequenceClassification,
AlbertForMultiLabelSequenceClassification,
)

from transformers import (
WEIGHTS_NAME,
BertConfig,
BertForSequenceClassification,
BertTokenizer,
XLMConfig,
XLMForSequenceClassification,
XLMTokenizer,
XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer,
RobertaConfig,
RobertaForSequenceClassification,
RobertaTokenizer,
CamembertConfig,
CamembertForSequenceClassification,
CamembertTokenizer,
AlbertConfig,
AlbertForSequenceClassification,
AlbertTokenizer,
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
)
from transformers import AutoTokenizer

import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

MODEL_CLASSES = {
"bert": (
BertConfig,
(BertForSequenceClassification, BertForMultiLabelSequenceClassification),
BertTokenizer,
),
"xlnet": (
XLNetConfig,
(XLNetForSequenceClassification, XLNetForMultiLabelSequenceClassification),
XLNetTokenizer,
),
"xlm": (
XLMConfig,
(XLMForSequenceClassification, XLMForSequenceClassification),
XLMTokenizer,
),
"roberta": (
RobertaConfig,
(RobertaForSequenceClassification, RobertaForMultiLabelSequenceClassification),
RobertaTokenizer,
),
"distilbert": (
DistilBertConfig,
(
DistilBertForSequenceClassification,
DistilBertForMultiLabelSequenceClassification,
),
DistilBertTokenizer,
),
"albert": (
AlbertConfig,
(AlbertForSequenceClassification, AlbertForMultiLabelSequenceClassification),
AlbertTokenizer,
),
"camembert-base": (
CamembertConfig,
(
CamembertForSequenceClassification,
CamembertForMultiLabelSequenceClassification,
),
CamembertTokenizer,
),
}


class BertClassificationPredictor(object):
def __init__(
@@ -94,6 +18,7 @@ def __init__(
label_path,
multi_label=False,
model_type="bert",
use_fast_tokenizer=True,
do_lower_case=True,
):
self.model_path = model_path
@@ -102,16 +27,14 @@ def __init__(
self.model_type = model_type
self.do_lower_case = do_lower_case

# Use auto-tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_path, use_fast=use_fast_tokenizer
)

self.learner = self.get_learner()

def get_learner(self):

_, _, tokenizer_class = MODEL_CLASSES[self.model_type]
# instantiate the new tokeniser object using the tokeniser name
tokenizer = tokenizer_class.from_pretrained(
self.model_path, do_lower_case=self.do_lower_case
)

if torch.cuda.is_available():
device = torch.device("cuda")
else:
@@ -120,7 +43,7 @@ def get_learner(self):
databunch = BertDataBunch(
self.label_path,
self.label_path,
tokenizer,
self.tokenizer,
train_file=None,
val_file=None,
batch_size_per_gpu=32,
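BertClassificationPredictor now builds its tokenizer once in __init__ through AutoTokenizer, gated by the new use_fast_tokenizer flag, instead of looking a class up in MODEL_CLASSES. A usage sketch under the new signature; the paths are illustrative:

```python
from fast_bert.prediction import BertClassificationPredictor

# model_path must hold the fine-tuned weights plus config.json so that
# AutoTokenizer can infer the right tokenizer; paths are illustrative.
predictor = BertClassificationPredictor(
    model_path="./models/output",
    label_path="./labels",        # directory containing labels.csv
    multi_label=False,
    model_type="bert",
    use_fast_tokenizer=True,      # new flag introduced by this PR
)

print(predictor.predict("fast-bert now resolves models automatically"))
```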
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ tensorboardX
fastprogress
sklearn
spacy
transformers>=2.3.0
transformers>=2.8.0
pandas
python-box
tokenizers
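
The transformers floor moves from 2.3.0 to 2.8.0, presumably to guarantee the AutoConfig, AutoModelForSequenceClassification, and fast-tokenizer support the rest of this PR relies on.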
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@

setup(
name="fast_bert",
version="1.6.5",
version="1.7.0",
description="AI Library using BERT",
author="Kaushal Trivedi",
author_email="kaushaltrivedi@me.com",