fix conflict
kermitt2 committed Nov 28, 2023
2 parents 3e68dd9 + 3671cd4 commit f4c9d89
Showing 8 changed files with 216 additions and 60 deletions.
2 changes: 1 addition & 1 deletion Readme.md
@@ -37,7 +37,7 @@ Some contributions include:

A native Java integration of the library has been realized in [GROBID](https://github.com/kermitt2/grobid) via [JEP](https://github.com/ninia/jep).

The latest DeLFT release has been tested successfully with Python 3.8 and TensorFlow 2.9.3. As always, GPU(s) are required for decent training time. A GeForce GTX 1050 Ti (4GB), for instance, is fine for running RNN models and BERT or RoBERTa base models. Using a BERT large model is possible from a GeForce GTX 1080 Ti (11GB) with a modest batch size.
The latest DeLFT release has been tested successfully with Python 3.8 and TensorFlow 2.9.3. As always, GPU(s) are required for decent training time. A GeForce GTX 1050 Ti (4GB), for instance, is fine for running RNN models and BERT or RoBERTa base models. Using a BERT large model is possible from a GeForce GTX 1080 Ti (11GB) with a modest batch size. Using multiple GPUs is supported for both training and inference.

## DeLFT Documentation

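For context, the multi-GPU support mentioned in the README line above follows the standard TensorFlow 2.x data-parallel pattern. The sketch below is a generic illustration, not DeLFT's actual implementation, of what a multi_gpu switch typically toggles and why the global batch size should grow with the number of GPUs.

```python
# Generic illustration only (not DeLFT code): a multi_gpu flag usually means building
# and compiling the model under tf.distribute.MirroredStrategy, which replicates the
# variables on every visible GPU and averages gradients across replicas.
import contextlib
import numpy as np
import tensorflow as tf

def train_with_optional_mirroring(build_model, x, y, global_batch_size, multi_gpu=False):
    scope = tf.distribute.MirroredStrategy().scope() if multi_gpu else contextlib.nullcontext()
    with scope:
        model = build_model()
        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
    # Under MirroredStrategy, batch_size is the global batch, split evenly per replica,
    # hence the advice to raise --batch-size when enabling --multi-gpu.
    model.fit(x, y, batch_size=global_batch_size, epochs=1, verbose=0)
    return model

# toy model and data, purely for illustration
build = lambda: tf.keras.Sequential([tf.keras.layers.Dense(8, activation="relu", input_shape=(4,)),
                                     tf.keras.layers.Dense(3, activation="softmax")])
x = np.random.rand(64, 4).astype("float32")
y = np.random.randint(0, 3, size=(64,))
train_with_optional_mirroring(build, x, y, global_batch_size=32, multi_gpu=False)
```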
46 changes: 32 additions & 14 deletions delft/applications/datasetTagger.py
@@ -70,10 +70,15 @@ def configure(architecture, output_path=None, max_sequence_length=-1,

# train a model with all available data
def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1,
batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1,
learning_rate=None, early_stop=None):
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1,
batch_size=-1, use_ELMo=False,
max_epoch=-1,
patience=-1,
learning_rate=None,
early_stop=None,
multi_gpu=False):

print('Loading data...')
if input_path is None:
x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -122,7 +127,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
learning_rate=learning_rate)

start_time = time.time()
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

print("training runtime: %s seconds " % runtime)
@@ -136,9 +141,14 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,

# split data, train a model and evaluate it
def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False,
patience=-1, learning_rate=None, early_stop=None):
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, use_ELMo=False,
max_epoch=-1,
patience=-1,
learning_rate=None,
early_stop=None,
multi_gpu=False):

print('Loading data...')
if input_path is None:
x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
@@ -214,7 +224,7 @@ def eval_(input_path=None, architecture=None):


# annotate a list of texts
def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False):
annotations = []

# load model
@@ -228,7 +238,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non

start_time = time.time()

annotations = model.tag(texts, output_format, features=features)
annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -292,11 +302,16 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after "
"the best epoch before stopping a training.")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")

parser.add_argument("--max-epoch", type=int, default=-1,
help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.")
help="Maximum number of epochs.")
parser.add_argument("--early-stop", type=t_or_f, default=None,
help="Force training early termination when evaluation scores at the end of "
"n epochs are not changing.")
parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")


args = parser.parse_args()

@@ -313,6 +328,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
learning_rate = args.learning_rate
max_epoch = args.max_epoch
early_stop = args.early_stop
multi_gpu = args.multi_gpu

if transformer is None and embeddings_name is None:
# default word embeddings
@@ -330,7 +346,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
patience=patience,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == "eval":
if args.fold_count is not None and args.fold_count > 1:
@@ -355,7 +372,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
patience=patience,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == "tag":
someTexts = []
@@ -365,7 +383,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non
someTexts.append("We also compare ShanghaiTechRGBD with other RGB-D crowd counting datasets in , and we can see that ShanghaiTechRGBD is the most challenging RGB-D crowd counting dataset in terms of the number of images and heads.")
someTexts.append("Insulin levels of all samples were measured by ELISA kit (Mercodia)")

result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo)
result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu)
print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False))
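At the application level, the new multi_gpu keyword is simply forwarded to the Sequence wrapper's train() and tag() methods, as the calls above show. A minimal training sketch with made-up data and model name, assuming the GloVe embeddings are registered locally under the usual name:

```python
# Hypothetical usage sketch; the keyword arguments follow the signatures visible in
# this diff, but the model name, labels and training data are invented for illustration.
from delft.sequenceLabelling import Sequence

x_train = [["The", "raw", "data", "are", "available", "on", "Zenodo", "."]]
y_train = [["O", "O", "B-dataset", "O", "O", "O", "O", "O"]]

model = Sequence("datasets-BidLSTM_CRF", architecture="BidLSTM_CRF",
                 embeddings_name="glove-840B",   # assumed embedding registry entry
                 batch_size=32, max_epoch=1)
# multi_gpu=True runs training under a distribution strategy; batch_size is then global
model.train(x_train, y_train, x_valid=x_train, y_valid=y_train, multi_gpu=True)
model.save()
```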


32 changes: 22 additions & 10 deletions delft/applications/grobidTagger.py
@@ -151,9 +151,10 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat


# train a GROBID model with all available data

def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None,
output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None):
use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None, multi_gpu=False):

print('Loading data...')
if input_path == None:
@@ -205,7 +206,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu
model.load()

start_time = time.time()
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental)
model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental, multi_gpu=multi_gpu)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % (runtime))

@@ -221,7 +223,8 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
input_path=None, output_path=None, fold_count=1,
features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1,
use_ELMo=False, incremental=False, input_model_path=None, patience=-1,
learning_rate=None, early_stop=None):
learning_rate=None, early_stop=None, multi_gpu=False):

print('Loading data...')
if input_path is None:
x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
@@ -276,9 +279,9 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor
start_time = time.time()

if fold_count == 1:
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu)
else:
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental)
model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu)

runtime = round(time.time() - start_time, 3)
print("training runtime: %s seconds " % runtime)
@@ -327,7 +330,7 @@ def eval_(model, input_path=None, architecture='BidLSTM_CRF', use_ELMo=False):

# annotate a list of texts, this is relevant only for models taking only text as input
# (so not text with layout information)
def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False):
def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False):
annotations = []

# load model
@@ -341,7 +344,7 @@ def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', featu

start_time = time.time()

annotations = model.tag(texts, output_format, features=features)
annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -415,12 +418,17 @@ class Tasks:
parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after "
"the best epoch before stopping a training.")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")

parser.add_argument("--max-epoch", type=int, default=-1,
help="Maximum number of epochs for training.")
parser.add_argument("--early-stop", type=t_or_f, default=None,
help="Force training early termination when evaluation scores at the end of "
"n epochs are not changing.")

parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")

args = parser.parse_args()

model = args.model
@@ -439,6 +447,7 @@ class Tasks:
learning_rate = args.learning_rate
max_epoch = args.max_epoch
early_stop = args.early_stop
multi_gpu = args.multi_gpu

if architecture is None:
raise ValueError("A model architecture has to be specified: " + str(architectures))
@@ -462,7 +471,8 @@ class Tasks:
patience=patience,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == Tasks.EVAL:
if args.fold_count is not None and args.fold_count > 1:
@@ -489,7 +499,8 @@ class Tasks:
input_model_path=input_model_path,
learning_rate=learning_rate,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if action == Tasks.TAG:
someTexts = []
@@ -499,6 +510,7 @@ class Tasks:
someTexts.append("March the 27th, 2001")
someTexts.append(" on April 27, 2001. . ")
someTexts.append('2018')
someTexts.append('2023 July the 22nd')
elif model == 'citation':
someTexts.append("N. Al-Dhahir and J. Cioffi, \“On the uniform ADC bit precision and clip level computation for a Gaussian signal,\” IEEE Trans. Signal Processing, pp. 434–438, Feb. 1996.")
someTexts.append("T. Steinherz, E. Rivlin, N. Intrator, Off-line cursive script word recognition—a survey, Int. J. Doc. Anal. Recognition 2(3) (1999) 1–33.")
@@ -515,7 +527,7 @@ class Tasks:
someTexts.append("The statistical analysis was performed using IBM SPSS Statistics v. 20 (SPSS Inc, 2003, Chicago, USA).")

if architecture.find("FEATURE") == -1:
result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo)
result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu)
print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False))
else:
print("The model " + architecture + " cannot be used without supplying features as input and it's disabled. "
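The --multi-gpu help text above notes that --batch-size should be adjusted accordingly: with data-parallel (mirrored) training the configured batch is a global batch split across replicas, so a rough sizing rule is to scale a per-GPU batch by the number of GPUs. Illustrative arithmetic only:

```python
# Illustrative only: choose a per-GPU batch that fits in memory, then pass the
# scaled global batch via --batch-size when running with --multi-gpu.
num_gpus = 4
per_gpu_batch = 16
global_batch = per_gpu_batch * num_gpus   # i.e. --batch-size 64
assert global_batch // num_gpus == per_gpu_batch
print(global_batch)
```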
28 changes: 19 additions & 9 deletions delft/applications/insultTagger.py
@@ -36,10 +36,12 @@ def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1, early_

def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
use_ELMo=False, learning_rate=None,
batch_size=-1, max_epoch=-1, early_stop=None):
batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(
architecture, embeddings_name, batch_size, max_epoch, early_stop)

batch_size=-1, max_epoch=-1, early_stop=None, multi_gpu=False):
batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture,
embeddings_name,
batch_size,
max_epoch,
early_stop)
root = 'data/sequenceLabelling/toxic/'

train_path = os.path.join(root, 'corrected.xml')
@@ -58,15 +60,15 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
model = Sequence(model_name, max_epoch=max_epoch, batch_size=batch_size, max_sequence_length=maxlen,
embeddings_name=embeddings_name, architecture=architecture, patience=patience, early_stop=early_stop,
transformer_name=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu)
print('training done')

# saving the model (must be called after eval for multiple fold training)
model.save()


# annotate a list of texts, provides results in a list of offset mentions
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False):
def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, multi_gpu=False):
annotations = []

model_name = 'insult-' + architecture
@@ -79,7 +81,7 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,

start_time = time.time()

annotations = model.tag(texts, output_format)
annotations = model.tag(texts, output_format, multi_gpu=multi_gpu)
runtime = round(time.time() - start_time, 3)

if output_format == 'json':
@@ -132,12 +134,17 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings")
parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate")
parser.add_argument("--max-epoch", type=int, default=-1,
help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.")
help="Maximum number of epochs.")
parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.")
parser.add_argument("--early-stop", type=t_or_f, default=None,
help="Force training early termination when evaluation scores at the end of "
"n epochs are not changing.")

parser.add_argument("--multi-gpu", default=False,
help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)",
action="store_true")


args = parser.parse_args()

if args.action not in ('train', 'tag'):
@@ -148,9 +155,11 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
transformer = args.transformer
use_ELMo = args.use_ELMo
learning_rate = args.learning_rate

batch_size = args.batch_size
max_epoch = args.max_epoch
early_stop = args.early_stop
multi_gpu = args.multi_gpu

if transformer == None and embeddings_name == None:
# default word embeddings
@@ -164,7 +173,8 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None,
learning_rate=learning_rate,
batch_size=batch_size,
max_epoch=max_epoch,
early_stop=early_stop)
early_stop=early_stop,
multi_gpu=multi_gpu)

if args.action == 'tag':
someTexts = ['This is a gentle test.',
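Inference takes the same flag: model.tag() accepts multi_gpu, as in annotate() above. A hypothetical tagging sketch, assuming an insult model has already been trained and saved, and assuming the usual Sequence(model_name) / model.load() pattern of the application scripts:

```python
# Hypothetical sketch: the model name and the load() call follow the usual pattern of
# the DeLFT application scripts and assume a previously trained and saved model.
from delft.sequenceLabelling import Sequence

model = Sequence("insult-BidLSTM_CRF")
model.load()
texts = ["This is a gentle test.", "Now this one is a bit less gentle."]
print(model.tag(texts, "json", multi_gpu=True))
```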