fix conflict
kermitt2 committed Nov 28, 2023
2 parents 3e68dd9 + 3671cd4 commit f4c9d89
Showing 8 changed files with 216 additions and 60 deletions.
2 changes: 1 addition & 1 deletion Readme.md
@@ -37,7 +37,7 @@ Some contributions include:

A native Java integration of the library has been realized in [GROBID](https://github.com/kermitt2/grobid) via [JEP](https://github.com/ninia/jep).

The latest DeLFT release has been tested successfully with Python 3.8 and TensorFlow 2.9.3. As always, GPU(s) are required for decent training time. A GeForce GTX 1050 Ti (4GB), for instance, is fine for running RNN models and BERT or RoBERTa base models. Using a BERT large model is possible from a GeForce GTX 1080 Ti (11GB) with a modest batch size.
The latest DeLFT release has been tested successfully with Python 3.8 and TensorFlow 2.9.3. As always, GPU(s) are required for decent training time. A GeForce GTX 1050 Ti (4GB), for instance, is fine for running RNN models and BERT or RoBERTa base models. Using a BERT large model is possible from a GeForce GTX 1080 Ti (11GB) with a modest batch size. Using multiple GPUs is supported for both training and inference.

## DeLFT Documentation

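For context, the multi-GPU support mentioned in the README line above follows the standard TensorFlow 2.x data-parallel pattern. The sketch below is a generic illustration, not DeLFT's actual implementation, of what a multi_gpu switch typically toggles and why the global batch size should grow with the number of GPUs.

```python
# Generic illustration only (not DeLFT code): a multi_gpu flag usually means building
# and compiling the model under tf.distribute.MirroredStrategy, which replicates the
# variables on every visible GPU and averages gradients across replicas.
import contextlib
import numpy as np
import tensorflow as tf

def train_with_optional_mirroring(build_model, x, y, global_batch_size, multi_gpu=False):
    scope = tf.distribute.MirroredStrategy().scope() if multi_gpu else contextlib.nullcontext()
    with scope:
        model = build_model()
        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
    # Under MirroredStrategy, batch_size is the global batch, split evenly per replica,
    # hence the advice to raise --batch-size when enabling --multi-gpu.
    model.fit(x, y, batch_size=global_batch_size, epochs=1, verbose=0)
    return model

# toy model and data, purely for illustration
build = lambda: tf.keras.Sequential([tf.keras.layers.Dense(8, activation="relu", input_shape=(4,)),
                                     tf.keras.layers.Dense(3, activation="softmax")])
x = np.random.rand(64, 4).astype("float32")
y = np.random.randint(0, 3, size=(64,))
train_with_optional_mirroring(build, x, y, global_batch_size=32, multi_gpu=False)
```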
46 changes: 32 additions & 14 deletions delft/applications/datasetTagger.py
@@ -70,10 +70,15 @@ def configure(architecture, output_path=None, max_sequence_length=-1,

# train a model with all available data
def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1,
batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1,
learning_rate=None, early_stop=None):
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1,
batch_size=-1, use_ELMo=False,
max_epoch=-1,
patience=-1,
learning_rate=None,
early_stop=None,
multi_gpu=False):

print('Loading data...')
if input_path is None:
x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -122,7 +127,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
learning_rate=learning_rate)

start_time = time.time()
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

print("training runtime: %s seconds " % runtime)
@@ -136,9 +141,14 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,

# split data, train a model and evaluate it
def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False,
patience=-1, learning_rate=None, early_stop=None):
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, use_ELMo=False,
max_epoch=-1,
patience=-1,
learning_rate=None,
early_stop=None,
multi_gpu=False):

print('Loading data...')
if input_path is None:
x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -214,7 +224,7 @@ def eval_(input_path=None, architecture=None):


# annotate a list of texts
def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False):
annotations = []

# load model
@@ -228,7 +238,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non

start_time = time.time()

annotations = model.tag(texts, output_format, features=features)
annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -292,11 +302,16 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after "
"the best epoch before stopping a training.")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")

parser.add_argument("--max-epoch", type=int, default=-1,
help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.")
help="Maximum number of epochs.")
parser.add_argument("--early-stop", type=t_or_f, default=None,
help="Force training early termination when evaluation scores at the end of "
"n epochs are not changing.")
parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")


args = parser.parse_args()

@@ -313,6 +328,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
learning_rate = args.learning_rate
max_epoch = args.max_epoch
early_stop = args.early_stop
multi_gpu = args.multi_gpu

if transformer is None and embeddings_name is None:
# default word embeddings
@@ -330,7 +346,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
patience=patience,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == "eval":
if args.fold_count is not None and args.fold_count > 1:
@@ -355,7 +372,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
patience=patience,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == "tag":
someTexts = []
@@ -365,7 +383,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
someTexts.append("We also compare ShanghaiTechRGBD with other RGB-D crowd counting datasets in , and we can see that ShanghaiTechRGBD is the most challenging RGB-D crowd counting dataset in terms of the number of images and heads.")
someTexts.append("Insulin levels of all samples were measured by ELISA kit (Mercodia)")

result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo)
result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu)
print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False))
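At the application level, the new multi_gpu keyword is simply forwarded to the Sequence wrapper's train() and tag() methods, as the calls above show. A minimal training sketch with made-up data and model name, assuming the GloVe embeddings are registered locally under the usual name:

```python
# Hypothetical usage sketch; the keyword arguments follow the signatures visible in
# this diff, but the model name, labels and training data are invented for illustration.
from delft.sequenceLabelling import Sequence

x_train = [["The", "raw", "data", "are", "available", "on", "Zenodo", "."]]
y_train = [["O", "O", "B-dataset", "O", "O", "O", "O", "O"]]

model = Sequence("datasets-BidLSTM_CRF", architecture="BidLSTM_CRF",
                 embeddings_name="glove-840B",   # assumed embedding registry entry
                 batch_size=32, max_epoch=1)
# multi_gpu=True runs training under a distribution strategy; batch_size is then global
model.train(x_train, y_train, x_valid=x_train, y_valid=y_train, multi_gpu=True)
model.save()
```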


32 changes: 22 additions & 10 deletions delft/applications/grobidTagger.py
@@ -151,9 +151,10 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat


# train a GROBID model with all available data

def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None):
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None, multi_gpu=False):

print('Loading data...')
if input_path == None:
@@ -205,7 +206,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
model.load()

start_time = time.time()
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental)
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental, multi_gpu=multi_gpu)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % (runtime))

@@ -221,7 +223,8 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1,
learning_rate=None, early_stop=None):
learning_rate=None, early_stop=None, multi_gpu=False):

print('Loading data...')
if input_path is None:
x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
@@ -276,9 +279,9 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
start_time = time.time()

if fold_count == 1:
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu)
else:
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % runtime)
@@ -327,7 +330,7 @@ def eval_(model, input_path=None, architecture='BidLSTM_CRF', use_ELMo=False):

# annotate a list of texts, this is relevant only for models taking only text as input
# (so not text with layout information)
def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False):
annotations = []

# load model
@@ -341,7 +344,7 @@ def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', featu

start_time = time.time()

annotations = model.tag(texts, output_format, features=features)
annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -415,12 +418,17 @@ class Tasks:
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after "
"the best epoch before stopping a training.")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")

parser.add_argument("--max-epoch", type=int, default=-1,
help="Maximum number of epochs for training.")
parser.add_argument("--early-stop", type=t_or_f, default=None,
help="Force training early termination when evaluation scores at the end of "
"n epochs are not changing.")

parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")

args = parser.parse_args()

model = args.model
@@ -439,6 +447,7 @@ class Tasks:
learning_rate = args.learning_rate
max_epoch = args.max_epoch
early_stop = args.early_stop
multi_gpu = args.multi_gpu

if architecture is None:
raise ValueError("A model architecture has to be specified: " + str(architectures))
@@ -462,7 +471,8 @@ class Tasks:
patience=patience,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == Tasks.EVAL:
if args.fold_count is not None and args.fold_count > 1:
@@ -489,7 +499,8 @@ class Tasks:
input_model_path=input_model_path,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == Tasks.TAG:
someTexts = []
@@ -499,6 +510,7 @@ class Tasks:
someTexts.append("March the 27th, 2001")
someTexts.append(" on April 27, 2001. . ")
someTexts.append('2018')
someTexts.append('2023 July the 22nd')
elif model == 'citation':
someTexts.append("N. Al-Dhahir and J. Cioffi, \“On the uniform ADC bit precision and clip level computation for a Gaussian signal,\” IEEE Trans. Signal Processing, pp. 434–438, Feb. 1996.")
someTexts.append("T. Steinherz, E. Rivlin, N. Intrator, Off-line cursive script word recognition—a survey, Int. J. Doc. Anal. Recognition 2(3) (1999) 1–33.")
@@ -515,7 +527,7 @@ class Tasks:
someTexts.append("The statistical analysis was performed using IBM SPSS Statistics v. 20 (SPSS Inc, 2003, Chicago, USA).")

if architecture.find("FEATURE") == -1:
result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo)
result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu)
print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False))
else:
print("The model " + architecture + " cannot be used without supplying features as input and it's disabled. "
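The --multi-gpu help text above notes that --batch-size should be adjusted accordingly: with data-parallel (mirrored) training the configured batch is a global batch split across replicas, so a rough sizing rule is to scale a per-GPU batch by the number of GPUs. Illustrative arithmetic only:

```python
# Illustrative only: choose a per-GPU batch that fits in memory, then pass the
# scaled global batch via --batch-size when running with --multi-gpu.
num_gpus = 4
per_gpu_batch = 16
global_batch = per_gpu_batch * num_gpus   # i.e. --batch-size 64
assert global_batch // num_gpus == per_gpu_batch
print(global_batch)
```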
28 changes: 19 additions & 9 deletions delft/applications/insultTagger.py
@@ -36,10 +36,12 @@ def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1, early_

def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
use_ELMo=False, learning_rate=None,
batch_size=-1, max_epoch=-1, early_stop=None):
batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(
architecture, embeddings_name, batch_size, max_epoch, early_stop)

batch_size=-1, max_epoch=-1, early_stop=None, multi_gpu=False):
batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture,
embeddings_name,
batch_size,
max_epoch,
early_stop)
root = 'data/sequenceLabelling/toxic/'

train_path = os.path.join(root, 'corrected.xml')
@@ -58,15 +60,15 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
model = Sequence(model_name, max_epoch=max_epoch, batch_size=batch_size, max_sequence_length=maxlen,
embeddings_name=embeddings_name, architecture=architecture, patience=patience, early_stop=early_stop,
transformer_name=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu)
print('training done')

# saving the model (must be called after eval for multiple fold training)
model.save()


# annotate a list of texts, provides results in a list of offset mentions
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False):
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, multi_gpu=False):
annotations = []

model_name = 'insult-' + architecture
@@ -79,7 +81,7 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,

start_time = time.time()

annotations = model.tag(texts, output_format)
annotations = model.tag(texts, output_format, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -132,12 +134,17 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")
parser.add_argument("--max-epoch", type=int, default=-1,
help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.")
help="Maximum number of epochs.")
parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.")
parser.add_argument("--early-stop", type=t_or_f, default=None,
help="Force training early termination when evaluation scores at the end of "
"n epochs are not changing.")

parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")


args = parser.parse_args()

if args.action not in ('train', 'tag'):
@@ -148,9 +155,11 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
transformer = args.transformer
use_ELMo = args.use_ELMo
learning_rate = args.learning_rate

batch_size = args.batch_size
max_epoch = args.max_epoch
early_stop = args.early_stop
multi_gpu = args.multi_gpu

if transformer == None and embeddings_name == None:
# default word embeddings
@@ -164,7 +173,8 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
learning_rate=learning_rate,
batch_size=batch_size,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if args.action == 'tag':
someTexts = ['This is a gentle test.',
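Inference takes the same flag: model.tag() accepts multi_gpu, as in annotate() above. A hypothetical tagging sketch, assuming an insult model has already been trained and saved, and assuming the usual Sequence(model_name) / model.load() pattern of the application scripts:

```python
# Hypothetical sketch: the model name and the load() call follow the usual pattern of
# the DeLFT application scripts and assume a previously trained and saved model.
from delft.sequenceLabelling import Sequence

model = Sequence("insult-BidLSTM_CRF")
model.load()
texts = ["This is a gentle test.", "Now this one is a bit less gentle."]
print(model.tag(texts, "json", multi_gpu=True))
```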