diff --git a/Readme.md b/Readme.md index 04fbca58..2fe7789c 100644 --- a/Readme.md +++ b/Readme.md @@ -37,7 +37,7 @@ Some contributions include: A native Java integration of the library has been realized in [GROBID](https://github.com/kermitt2/grobid) via [JEP](https://github.com/ninia/jep). -The latest DeLFT release has been tested successfully with python 3.8 and Tensorflow 2.9.3. As always, GPU(s) are required for decent training time. A GeForce GTX 1050 Ti (4GB) for instance is fine for running RNN models and BERT or RoBERTa base models. Using BERT large model is possible from a GeForce GTX 1080 Ti (11GB) with modest batch size. +The latest DeLFT release has been tested successfully with python 3.8 and Tensorflow 2.9.3. As always, GPU(s) are required for decent training time. A GeForce GTX 1050 Ti (4GB) for instance is fine for running RNN models and BERT or RoBERTa base models. Using BERT large model is possible from a GeForce GTX 1080 Ti (11GB) with modest batch size. Using multiple GPUs (training and inference) is supported. ## DeLFT Documentation diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index 1a09bae7..ec6d46d2 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -70,10 +70,15 @@ def configure(architecture, output_path=None, max_sequence_length=-1, # train a model with all available data def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, - input_path=None, output_path=None, fold_count=1, - features_indices=None, max_sequence_length=-1, - batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1, - learning_rate=None, early_stop=None): + input_path=None, output_path=None, fold_count=1, + features_indices=None, max_sequence_length=-1, + batch_size=-1, use_ELMo=False, + max_epoch=-1, + patience=-1, + learning_rate=None, + early_stop=None, + multi_gpu=False): + print('Loading data...') if input_path is None: x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = [] @@ -122,7 +127,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, learning_rate=learning_rate) start_time = time.time() - model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid) + model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) print("training runtime: %s seconds " % runtime) @@ -136,9 +141,14 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, # split data, train a model and evaluate it def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, - input_path=None, output_path=None, fold_count=1, - features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False, - patience=-1, learning_rate=None, early_stop=None): + input_path=None, output_path=None, fold_count=1, + features_indices=None, max_sequence_length=-1, batch_size=-1, use_ELMo=False, + max_epoch=-1, + patience=-1, + learning_rate=None, + early_stop=None, + multi_gpu=False): + print('Loading data...') if input_path is None: x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = [] @@ -214,7 +224,7 @@ def eval_(input_path=None, architecture=None): # annotate a list of texts -def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False): +def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False): annotations = [] # load model @@ -228,7 +238,7 @@ def annotate_text(texts, 
output_format, architecture='BidLSTM_CRF', features=Non start_time = time.time() - annotations = model.tag(texts, output_format, features=features) + annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) if output_format == 'json': @@ -292,11 +302,16 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, - help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.") + help="Maximum number of epochs.") parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") + parser.add_argument("--multi-gpu", default=False, + help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", + action="store_true") + args = parser.parse_args() @@ -313,6 +328,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non learning_rate = args.learning_rate max_epoch = args.max_epoch early_stop = args.early_stop + multi_gpu = args.multi_gpu if transformer is None and embeddings_name is None: # default word embeddings @@ -330,7 +346,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non patience=patience, learning_rate=learning_rate, max_epoch=max_epoch, - early_stop=early_stop) + early_stop=early_stop, + multi_gpu=multi_gpu) if action == "eval": if args.fold_count is not None and args.fold_count > 1: @@ -355,7 +372,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non patience=patience, learning_rate=learning_rate, max_epoch=max_epoch, - early_stop=early_stop) + early_stop=early_stop, + multi_gpu=multi_gpu) if action == "tag": someTexts = [] @@ -365,7 +383,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non someTexts.append("We also compare ShanghaiTechRGBD with other RGB-D crowd counting datasets in , and we can see that ShanghaiTechRGBD is the most challenging RGB-D crowd counting dataset in terms of the number of images and heads.") someTexts.append("Insulin levels of all samples were measured by ELISA kit (Mercodia)") - result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo) + result = annotate_text(someTexts, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu) print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False)) diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 7db1929a..d4a7e30b 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -151,9 +151,10 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat # train a GROBID model with all available data + def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None, early_stop=None): + use_ELMo=False, incremental=False, input_model_path=None, 
patience=-1, learning_rate=None, early_stop=None, multi_gpu=False): print('Loading data...') if input_path == None: @@ -205,7 +206,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu model.load() start_time = time.time() - model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental) + model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental=incremental, multi_gpu=multi_gpu) + runtime = round(time.time() - start_time, 3) print("training runtime: %s seconds " % (runtime)) @@ -221,7 +223,8 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False, incremental=False, input_model_path=None, patience=-1, - learning_rate=None, early_stop=None): + learning_rate=None, early_stop=None, multi_gpu=False): + print('Loading data...') if input_path is None: x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train') @@ -276,9 +279,9 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor start_time = time.time() if fold_count == 1: - model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental) + model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu) else: - model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental) + model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, incremental=incremental, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) print("training runtime: %s seconds " % runtime) @@ -327,7 +330,7 @@ def eval_(model, input_path=None, architecture='BidLSTM_CRF', use_ELMo=False): # annotate a list of texts, this is relevant only of models taking only text as input # (so not text with layout information) -def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False): +def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', features=None, use_ELMo=False, multi_gpu=False): annotations = [] # load model @@ -341,7 +344,7 @@ def annotate_text(texts, model, output_format, architecture='BidLSTM_CRF', featu start_time = time.time() - annotations = model.tag(texts, output_format, features=features) + annotations = model.tag(texts, output_format, features=features, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) if output_format == 'json': @@ -415,12 +418,17 @@ class Tasks: parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, help="Maximum number of epochs for training.") parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") + parser.add_argument("--multi-gpu", default=False, + help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", + action="store_true") + args = parser.parse_args() 
model = args.model @@ -439,6 +447,7 @@ class Tasks: learning_rate = args.learning_rate max_epoch = args.max_epoch early_stop = args.early_stop + multi_gpu = args.multi_gpu if architecture is None: raise ValueError("A model architecture has to be specified: " + str(architectures)) @@ -462,7 +471,8 @@ class Tasks: patience=patience, learning_rate=learning_rate, max_epoch=max_epoch, - early_stop=early_stop) + early_stop=early_stop, + multi_gpu=multi_gpu) if action == Tasks.EVAL: if args.fold_count is not None and args.fold_count > 1: @@ -489,7 +499,8 @@ class Tasks: input_model_path=input_model_path, learning_rate=learning_rate, max_epoch=max_epoch, - early_stop=early_stop) + early_stop=early_stop, + multi_gpu=multi_gpu) if action == Tasks.TAG: someTexts = [] @@ -499,6 +510,7 @@ class Tasks: someTexts.append("March the 27th, 2001") someTexts.append(" on April 27, 2001. . ") someTexts.append('2018') + someTexts.append('2023 July the 22nd') elif model == 'citation': someTexts.append("N. Al-Dhahir and J. Cioffi, \“On the uniform ADC bit precision and clip level computation for a Gaussian signal,\” IEEE Trans. Signal Processing, pp. 434–438, Feb. 1996.") someTexts.append("T. Steinherz, E. Rivlin, N. Intrator, Off-line cursive script word recognition—a survey, Int. J. Doc. Anal. Recognition 2(3) (1999) 1–33.") @@ -515,7 +527,7 @@ class Tasks: someTexts.append("The statistical analysis was performed using IBM SPSS Statistics v. 20 (SPSS Inc, 2003, Chicago, USA).") if architecture.find("FEATURE") == -1: - result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo) + result = annotate_text(someTexts, model, "json", architecture=architecture, use_ELMo=use_ELMo, multi_gpu=multi_gpu) print(json.dumps(result, sort_keys=False, indent=4, ensure_ascii=False)) else: print("The model " + architecture + " cannot be used without supplying features as input and it's disabled. 
" diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index d4cd4095..3bae351c 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -36,10 +36,12 @@ def configure(architecture, embeddings_name, batch_size=-1, max_epoch=-1, early_ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None, - batch_size=-1, max_epoch=-1, early_stop=None): - batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure( - architecture, embeddings_name, batch_size, max_epoch, early_stop) - + batch_size=-1, max_epoch=-1, early_stop=None, multi_gpu=False): + batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, + embeddings_name, + batch_size, + max_epoch, + early_stop) root = 'data/sequenceLabelling/toxic/' train_path = os.path.join(root, 'corrected.xml') @@ -58,7 +60,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, model = Sequence(model_name, max_epoch=max_epoch, batch_size=batch_size, max_sequence_length=maxlen, embeddings_name=embeddings_name, architecture=architecture, patience=patience, early_stop=early_stop, transformer_name=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate) - model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid) + model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu) print('training done') # saving the model (must be called after eval for multiple fold training) @@ -66,7 +68,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, # annotate a list of texts, provides results in a list of offset mentions -def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False): +def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, multi_gpu=False): annotations = [] model_name = 'insult-' + architecture @@ -79,7 +81,7 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, start_time = time.time() - annotations = model.tag(texts, output_format) + annotations = model.tag(texts, output_format, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) if output_format == 'json': @@ -132,12 +134,17 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") parser.add_argument("--max-epoch", type=int, default=-1, - help="Maximum number of epochs. 
If specified, it is assumed that earlyStop=False.") + help="Maximum number of epochs.") parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") + parser.add_argument("--multi-gpu", default=False, + help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", + action="store_true") + + args = parser.parse_args() if args.action not in ('train', 'tag'): @@ -148,9 +155,11 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, transformer = args.transformer use_ELMo = args.use_ELMo learning_rate = args.learning_rate + batch_size = args.batch_size max_epoch = args.max_epoch early_stop = args.early_stop + multi_gpu = args.multi_gpu if transformer == None and embeddings_name == None: # default word embeddings @@ -164,7 +173,8 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, learning_rate=learning_rate, batch_size=batch_size, max_epoch=max_epoch, - early_stop=early_stop) + early_stop=early_stop, + multi_gpu=multi_gpu) if args.action == 'tag': someTexts = ['This is a gentle test.', diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 33407f01..808dcdfd 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -79,8 +79,9 @@ def configure(architecture, dataset_type, lang, embeddings_name, # train a model with all available for a given dataset def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF', + transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, - batch_size=-1, patience=-1, learning_rate=None, max_epoch=-1, early_stop=None): + batch_size=-1, patience=-1, learning_rate=None, max_epoch=-1, early_stop=None, multi_gpu=False): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience, max_epoch, early_stop) @@ -210,7 +211,8 @@ def train_eval(embeddings_name=None, max_sequence_length=-1, learning_rate=None, max_epoch=-1, - early_stop=None): + early_stop=None, + multi_gpu=False): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, @@ -468,9 +470,9 @@ def train_eval(embeddings_name=None, start_time = time.time() if fold_count == 1: - model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid) + model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu) else: - model.train_nfold(x_train, y_train, x_valid=x_valid, y_valid=y_valid) + model.train_nfold(x_train, y_train, x_valid=x_valid, y_valid=y_valid, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) print("training runtime: %s seconds " % (runtime)) @@ -533,7 +535,8 @@ def annotate(output_format, architecture='BidLSTM_CRF', file_in=None, file_out=None, - use_ELMo=False): + use_ELMo=False, + multi_gpu=False): if file_in is None: raise ValueError("an input file to be annotated must be provided") @@ -570,7 +573,7 @@ def annotate(output_format, start_time = time.time() - model.tag_file(file_in=file_in, 
output_format=output_format, file_out=file_out) + model.tag_file(file_in=file_in, output_format=output_format, file_out=file_out, multi_gpu=multi_gpu) runtime = round(time.time() - start_time, 3) print("runtime: %s seconds " % (runtime)) @@ -628,12 +631,18 @@ def annotate(output_format, parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") + parser.add_argument("--max-epoch", type=int, default=-1, - help="Maximum number of epochs. If specified, it is assumed that earlyStop=False.") + help="Maximum number of epochs.") parser.add_argument("--early-stop", type=t_or_f, default=None, help="Force training early termination when evaluation scores at the end of " "n epochs are not changing.") + parser.add_argument("--multi-gpu", default=False, + help="Enable the support for distributed computing (the batch size needs to be set accordingly using --batch-size)", + action="store_true") + + args = parser.parse_args() action = args.action @@ -656,6 +665,7 @@ def annotate(output_format, learning_rate = args.learning_rate max_epoch = args.max_epoch early_stop = args.early_stop + multi_gpu = args.multi_gpu # name of embeddings refers to the file delft/resources-registry.json # be sure to use here the same name as in the registry ('glove-840B', 'fasttext-crawl', 'word2vec'), @@ -677,7 +687,8 @@ def annotate(output_format, patience=patience, learning_rate=learning_rate, max_epoch=max_epoch, - early_stop=early_stop + early_stop=early_stop, + multi_gpu=multi_gpu ) if action == 'train_eval': @@ -698,7 +709,8 @@ def annotate(output_format, patience=patience, learning_rate=learning_rate, max_epoch=max_epoch, - early_stop=early_stop + early_stop=early_stop, + multi_gpu=multi_gpu ) if action == 'eval': @@ -714,14 +726,15 @@ def annotate(output_format, print("Language not supported:", lang) else: print(file_in) - result = annotate("json", - dataset_type, - lang, - architecture=architecture, + annotate("json", + dataset_type, + lang, + architecture=architecture, #transformer=transformer, - file_in=file_in, + file_in=file_in, file_out=file_out, - use_ELMo=use_ELMo) + use_ELMo=use_ELMo, + multi_gpu=multi_gpu) """ if result is not None: if file_out is None: diff --git a/delft/sequenceLabelling/tagger.py b/delft/sequenceLabelling/tagger.py index 71f6a2f9..cf389f54 100644 --- a/delft/sequenceLabelling/tagger.py +++ b/delft/sequenceLabelling/tagger.py @@ -1,6 +1,8 @@ import datetime import numpy as np +import tensorflow as tf +from packaging import version from delft.sequenceLabelling.data_generator import DataGeneratorTransformers from delft.sequenceLabelling.preprocess import Preprocessor @@ -22,7 +24,23 @@ def __init__(self, self.model_config = model_config self.embeddings = embeddings - def tag(self, texts, output_format, features=None): + def tag(self, texts, output_format, features=None, multi_gpu=False): + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoids an exception being thrown when the --multi-gpu approach is used on a single GPU system.
+ # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + + with strategy.scope(): + return self.tag_(texts, output_format, features) + else: + return self.tag_(texts, output_format, features) + + def tag_(self, texts, output_format, features=None): if output_format == 'json': res = { diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index 29e09111..f2b2d802 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -1,5 +1,7 @@ import os +from packaging import version + # ask tensorflow to be quiet and not print hundred lines of logs from delft.utilities.Transformer import TRANSFORMER_CONFIG_FILE_NAME, DEFAULT_TRANSFORMER_TOKENIZER_DIR from delft.utilities.misc import print_parameters @@ -141,7 +143,23 @@ def __init__(self, early_stop, patience, max_checkpoints_to_keep, multiprocessing) - def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, incremental=False, callbacks=None): + def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, incremental=False, callbacks=None, multi_gpu=False): + # TBD if valid is None, segment train to get one if early_stop is True + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoids an exception being thrown when the --multi-gpu approach is used on a single GPU system. + # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + + with strategy.scope(): + self.train_(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental, callbacks) + else: + self.train_(x_train, y_train, f_train, x_valid, y_valid, f_valid, incremental, callbacks) + + def train_(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_valid=None, incremental=False, callbacks=None): # TBD if valid is None, segment train to get one if early_stop is True # we concatenate all the training+validation data to create the model vocabulary @@ -194,13 +212,28 @@ def train(self, x_train, y_train, f_train=None, x_valid=None, y_valid=None, f_va if self.embeddings and self.embeddings.use_ELMo: self.embeddings.clean_ELMo_cache() - def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, incremental=False, callbacks=None): + def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, incremental=False, callbacks=None, multi_gpu=False): + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoids an exception being thrown when the --multi-gpu approach is used on a single GPU system.
+ # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + + with strategy.scope(): + self.train_nfold_(x_train, y_train, x_valid, y_valid, f_train, f_valid, incremental, callbacks) + else: + self.train_nfold_(x_train, y_train, x_valid, y_valid, f_train, f_valid, incremental, callbacks) + + def train_nfold_(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None, f_valid=None, incremental=False, callbacks=None): x_all = np.concatenate((x_train, x_valid), axis=0) if x_valid is not None else x_train y_all = np.concatenate((y_train, y_valid), axis=0) if y_valid is not None else y_train features_all = concatenate_or_none((f_train, f_valid), axis=0) if incremental: - if self.model == None and self.models == None: + if self.model is None and self.models is None: print("error: you must load a model first for an incremental training") return @@ -435,7 +468,23 @@ def eval_nfold(self, x_test, y_test, features=None): print(get_report(fold_average_evaluation, digits=4, include_avgs=['micro'])) - def tag(self, texts, output_format, features=None, batch_size=None): + def tag(self, texts, output_format, features=None, batch_size=None, multi_gpu=False): + if multi_gpu: + strategy = tf.distribute.MirroredStrategy() + print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync)) + + # This trick avoids an exception being thrown when the --multi-gpu approach is used on a single GPU system. + # It might be removed with TF 2.10 https://github.com/tensorflow/tensorflow/issues/50487 + if version.parse(tf.__version__) < version.parse('2.10.0'): + import atexit + atexit.register(strategy._extended._collective_ops._pool.close) # type: ignore + + with strategy.scope(): + return self.tag_(texts, output_format, features, batch_size) + else: + return self.tag_(texts, output_format, features, batch_size) + + def tag_(self, texts, output_format, features=None, batch_size=None): # annotate a list of sentences, return the list of annotations in the # specified output_format @@ -462,13 +511,13 @@ def tag(self, texts, output_format, features=None, batch_size=None): else: raise (OSError('Could not find a model.' + str(self.model))) - def tag_file(self, file_in, output_format, file_out, batch_size=None): + def tag_file(self, file_in, output_format, file_out, batch_size=None, multi_gpu=False): # Annotate a text file containing one sentence per line, the annotations are # written in the output file if not None, in the standard output otherwise.
# Processing is streamed by batches so that we can process huge files without # memory issues - if batch_size != None: + if batch_size is not None: self.model_config.batch_size = batch_size print("---") print("batch_size (prediction):", self.model_config.batch_size) @@ -486,10 +535,10 @@ def tag_file(self, file_in, output_format, file_out, batch_size=None): first = True with open(file_in, 'r') as f: texts = None - while texts == None or len(texts) == self.model_config.batch_size * self.nb_workers: + while texts is None or len(texts) == self.model_config.batch_size * self.nb_workers: texts = next_n_lines(f, self.model_config.batch_size * self.nb_workers) - annotations = tagger.tag(texts, output_format) + annotations = tagger.tag(texts, output_format, multi_gpu=multi_gpu) # if the following is true, we just output the JSON returned by the tagger without any modification directDump = False if first: diff --git a/doc/Install-DeLFT.md b/doc/Install-DeLFT.md index 0b71f7bd..9ed49e91 100644 --- a/doc/Install-DeLFT.md +++ b/doc/Install-DeLFT.md @@ -31,7 +31,7 @@ To ensure the availability of GPU devices for the right version of tensorflow, C ## Loading resources locally -Required resources to train models (static embeddings, pre-trained transformer models) will be downloaded automatically. However, if you wish to load these resources locally, you need to notify their local path in the resource registry file. +Required resources to train models (static embeddings, pre-trained transformer models) will be downloaded automatically, in particular via Hugging Face Hub using the model name identifier. However, if you wish to load these resources locally, you need to specify their local path in the resource registry file. Edit the file `delft/resources-registry.json` and modify the value for `path` according to the path where you have saved the corresponding embeddings. The embedding files must be unzipped. For instance, for loading glove-840B embeddings from a local path: @@ -47,8 +47,44 @@ Edit the file `delft/resources-registry.json` and modify the value for `path` ac "item": "word" }, ... - ] + ], + ... } ``` +For pre-trained transformer models (for example downloaded from Hugging Face), you can simply indicate the path to the model directory, as follows: + + +```json +{ + "transformers": [ + { + "name": "scilons/scilons-bert-v0.1", + "model_dir": "/media/lopez/T52/models/scilons/scilons-bert-v0.1/", + "lang": "en" + }, + ... + ], + ... +} +``` + +For older transformer formats with just config, vocab and checkpoint weights files, you can specify the resources like this: + +```json +{ + "transformers": [ + { + "name": "dmis-lab/biobert-base-cased-v1.2", + "path-config": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/bert_config.json", + "path-weights": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/model.ckpt-1000000", + "path-vocab": "/media/lopez/T5/embeddings/biobert_v1.2_pubmed/vocab.txt", + "lang": "en" + }, + ... + ], + ... +} +``` +
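Note on the pattern introduced in this diff: every `multi_gpu=True` entry point (training, n-fold training, tagging) wraps the existing logic in a `tf.distribute.MirroredStrategy` scope and registers an `atexit` workaround for TensorFlow < 2.10. The following is a minimal standalone sketch of that pattern, not DeLFT code; the helper name `run_with_optional_mirrored_strategy` is illustrative only.

```python
# Minimal sketch of the multi-GPU wrapping pattern used throughout this diff.
# The helper name is illustrative; DeLFT inlines the same logic in
# Sequence.train(), Sequence.train_nfold(), Sequence.tag() and Tagger.tag().
import atexit

import tensorflow as tf
from packaging import version


def run_with_optional_mirrored_strategy(fn, *args, multi_gpu=False, **kwargs):
    if not multi_gpu:
        return fn(*args, **kwargs)

    strategy = tf.distribute.MirroredStrategy()
    print('Running with multi-gpu. Number of devices: {}'.format(strategy.num_replicas_in_sync))

    # Workaround for an exception thrown at process exit when MirroredStrategy
    # is used on a single-GPU system; no longer needed from TF 2.10 onward
    # (https://github.com/tensorflow/tensorflow/issues/50487).
    if version.parse(tf.__version__) < version.parse('2.10.0'):
        atexit.register(strategy._extended._collective_ops._pool.close)  # type: ignore

    # Running inside the scope mirrors the model variables across all visible GPUs.
    with strategy.scope():
        return fn(*args, **kwargs)
```

From the command line, the same behaviour is exposed by the new `--multi-gpu` flag added to the application scripts (for example `delft/applications/nerTagger.py` or `delft/applications/grobidTagger.py`); as noted in the flag's help text, `--batch-size` should be scaled to the number of available GPUs.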