Provide new baseline checkpoints #67

Merged: 3 commits, Apr 26, 2023

42 changes: 30 additions & 12 deletions baselines/NER/run_training.sh
@@ -17,8 +17,26 @@ GPU="0"
# Choose which NER baseline trainings to run. You can run multiple trainings consecutively by
# separating them with spaces, but beware that they run in the order in which they are listed at
# the bottom of this file, not in the order given in the `run` list.

run="roberta_base"

# To run all trainings for the baselines included in the DocILE dataset paper, follow the steps
# below. (A combined sketch of steps 2-4 is shown after this file's diff.)
# Note: if you don't have enough memory on your GPU, reduce the batch size and correspondingly
# increase --gradient_accumulation_steps.
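# (Illustrative example, not part of the original script: the runs below use --train_bs 16 with
# --gradient_accumulation_steps 1, i.e. an effective batch size of 16; --train_bs 8 with
# --gradient_accumulation_steps 2 keeps the same effective batch size while using less GPU memory.)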
# Steps:
# 1. Get pretrained checkpoints for RoBERTa and LayoutLMv3. Either:
# 1a. download them: follow instructions in baselines/README.md, or
# 1b. rerun the pretrainings: set run="roberta_pretraining" here for the RoBERTa pretraining and,
# for LayoutLMv3, run `python pretrain.py` in the baselines/layoutlmv3_pretrain/ directory.
# 2. Run the following trainings (uncomment to run all consecutively):
# run="roberta_base roberta_ours layoutlmv3_base layoutlmv3_ours roberta_base_synthetic_pretraining roberta_ours_synthetic_pretraining layoutlmv3_ours_synthetic_pretraining"
# 3. Move last checkpoints of models pretrained on synthetic data to the expected location:
# cp -r \
# /app/data/baselines/trainings/${MODEL}_synthetic_pretraining/${TIMESTAMP}/checkpoint-187500 \
# /app/data/baselines/checkpoints/${MODEL}_synthetic_pretraining_187500
# 4. Run remaining trainings (uncomment to run all consecutively):
# run="roberta_base_with_synthetic_pretraining roberta_ours_with_synthetic_pretraining layoutlmv3_ours_with_synthetic_pretraining"

TIMESTAMP=$(date +"%Y%m%d_%H%M_%S")
OUTPUT_DIR_PREFIX="/app/data/baselines/trainings"

@@ -28,7 +46,7 @@ CHECKPOINTS_DIR="/app/data/baselines/checkpoints"
# Common parameters for all trainings with the exception of roberta_pretraining
DATA="--docile_path /app/data/docile/"
USE_PREPROCESSED="--preprocessed_dataset_path /app/data/baselines/preprocessed_dataset"
OTHER_COMMON_PARAMS="--save_total_limit 3 --weight_decay 0.001 --lr 2e-5 --dataloader_num_workers 8 --use_BIO_format --tag_everything --report_all_metrics --stride 0"
OTHER_COMMON_PARAMS="--save_total_limit 3 --weight_decay 0.001 --lr 2e-5 --dataloader_num_workers 8 --use_BIO_format --tag_everything --report_all_metrics"
COMMON_PARAMS="${DATA} ${USE_PREPROCESSED} ${OTHER_COMMON_PARAMS}"

# Used for synthetic pretraining of LayoutLMv3
@@ -75,7 +93,7 @@ fi

single_run="roberta_base"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1500 --gradient_accumulation_steps 4 --warmup_ratio 1.0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name roberta-base --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -84,7 +102,7 @@ fi

single_run="roberta_ours"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1000 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_pretraining_50000 --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -93,7 +111,7 @@ fi

single_run="layoutlmv3_base"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1500 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -102,7 +120,7 @@ fi

single_run="layoutlmv3_ours"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 16 --test_bs 16 --num_epochs 1000 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base --pretrained_weights ${CHECKPOINTS_DIR}/layoutlmv3_pretraining.ckpt"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -112,7 +130,7 @@ fi
single_run="roberta_base_synthetic_pretraining" # 30 epochs on synthetic data only
if [[ " ${run} " =~ " ${single_run} " ]]; then
data_params="--split synthetic"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1"
model="--model_name roberta-base --use_roberta"
all_params="${COMMON_PARAMS} ${data_params} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -122,7 +140,7 @@ fi
single_run="roberta_ours_synthetic_pretraining" # 30 epochs on synthetic data only
if [[ " ${run} " =~ " ${single_run} " ]]; then
data_params="--split synthetic"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_pretraining_50000 --use_roberta"
all_params="${COMMON_PARAMS} ${data_params} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -132,7 +150,7 @@ fi
single_run="layoutlmv3_ours_synthetic_pretraining" # 30 epochs on synthetic data only
if [[ " ${run} " =~ " ${single_run} " ]]; then
data_params="--split synthetic ${USE_ARROW}"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base --pretrained_weights ${CHECKPOINTS_DIR}/layoutlmv3_pretraining.ckpt"
all_params="${COMMON_PARAMS} ${data_params} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -141,7 +159,7 @@ fi

single_run="roberta_base_with_synthetic_pretraining"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 8 --test_bs 8 --num_epochs 1500 --gradient_accumulation_steps 8 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_base_synthetic_pretraining_187500 --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -150,7 +168,7 @@ fi

single_run="roberta_ours_with_synthetic_pretraining"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 8 --test_bs 8 --num_epochs 1500 --gradient_accumulation_steps 4 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_ours_synthetic_pretraining_187500 --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -159,7 +177,7 @@ fi

single_run="layoutlmv3_ours_with_synthetic_pretraining"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 16 --test_bs 16 --num_epochs 1500 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base --pretrained_weights ${CHECKPOINTS_DIR}/layoutlmv3_ours_synthetic_pretraining_187500"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -170,7 +188,7 @@ fi
# Not presented in the dataset paper
single_run="roberta_base_with_2d_embedding"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1500 --gradient_accumulation_steps 4 --warmup_ratio 1.0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name roberta-base --use_roberta --use_new_2D_pos_emb --pos_emb_dim 6500"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
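For reference, a combined sketch of steps 2-4 from the comments at the top of this file is shown below. It is only an outline: `run_training.sh` selects trainings via the `run` variable edited inside the file (there is no environment override), so the wrapper assumes you edit that variable by hand before each invocation, and the checkpoint destination names follow the `*_with_synthetic_pretraining` runs above.

```bash
#!/usr/bin/env bash
# Sketch only -- not part of this PR. Edit `run` inside baselines/NER/run_training.sh by hand
# before each invocation, as described in the comments at the top of that file.
set -euo pipefail

TRAININGS_DIR="/app/data/baselines/trainings"
CHECKPOINTS_DIR="/app/data/baselines/checkpoints"

# Step 2: with run set to the seven non-"with_synthetic_pretraining" trainings, launch them all.
bash baselines/NER/run_training.sh

# Step 3: copy the last synthetic-pretraining checkpoints to the paths expected by the
# *_with_synthetic_pretraining runs (cf. their --model_name / --pretrained_weights above).
for MODEL in roberta_base roberta_ours layoutlmv3_ours; do
  # Pick the most recent timestamp directory of the corresponding step-2 run.
  TIMESTAMP="$(ls -1 "${TRAININGS_DIR}/${MODEL}_synthetic_pretraining" | sort | tail -n 1)"
  cp -r "${TRAININGS_DIR}/${MODEL}_synthetic_pretraining/${TIMESTAMP}/checkpoint-187500" \
        "${CHECKPOINTS_DIR}/${MODEL}_synthetic_pretraining_187500"
done

# Step 4: with run set to the three *_with_synthetic_pretraining trainings, launch them.
bash baselines/NER/run_training.sh
```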
32 changes: 18 additions & 14 deletions baselines/README.md
@@ -4,6 +4,10 @@ The DocILE benchmark comes with several baselines to help you get started and to

## Download checkpoints and predictions

**Info:** New checkpoints were provided on April 27, 2023. The older checkpoints can temporarily still be downloaded with the `download_dataset.sh` script by using the string `baselines-20230315` instead of `baselines`. To see the results of the previously shared checkpoints on the test and validation sets, check the history of this file. The new trainings were all run with the current version of the code and with hyperparameters set to the same values for all NER-based models.

**Warning:** Furthermore, a [bug was fixed](https://github.com/rossumai/docile/pull/64) in the NER baselines training code on April 24, 2023 (inference and pretraining were not affected). This bug did not affect any of the shared baselines, as it was introduced during refactoring after the trainings of the originally shared checkpoints had already finished and before the new trainings started.

Checkpoints of various trained models are provided with predictions on the validation set. You can download them with the same [download script](../download_dataset.sh) that is provided for downloading the dataset in the root of this repository.

First you need to obtain a secret token by following the instructions at https://docile.rossum.ai/. Then run this from the root of this repository:

@@ -43,24 +47,24 @@ The main benchmark metric for KILE is `AP`; the best results on AP are shown in bold in the table.

| model | <ins>val-AP</ins> | val-F1 | val-precision | val-recall | <ins>test-AP</ins> | test-F1 | test-precision | test-recall |
|--------------------------------------------|---------------------|----------|-----------------|--------------|----------------------|-----------|------------------|---------------|
| roberta_base | 0.531 | 0.656 | 0.645 | 0.668 | 0.515 | 0.634 | 0.623 | 0.645 |
| roberta_ours | 0.528 | 0.661 | 0.647 | 0.675 | 0.503 | 0.634 | 0.617 | 0.651 |
| layoutlmv3_ours | 0.453 | 0.608 | 0.611 | 0.605 | 0.451 | 0.587 | 0.588 | 0.585 |
| roberta_base_with_synthetic_pretraining | 0.554 | 0.680 | 0.676 | 0.683 | 0.537 | 0.659 | 0.653 | 0.665 |
| roberta_ours_with_synthetic_pretraining | **0.557** | 0.683 | 0.682 | 0.683 | **0.541** | 0.656 | 0.655 | 0.657 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.507 | 0.656 | 0.662 | 0.651 | 0.493 | 0.641 | 0.646 | 0.636 |
| roberta_base | 0.552 | 0.688 | 0.681 | 0.694 | 0.534 | 0.664 | 0.658 | 0.671 |
| roberta_ours | 0.537 | 0.671 | 0.661 | 0.682 | 0.515 | 0.645 | 0.634 | 0.656 |
| layoutlmv3_ours | 0.513 | 0.657 | 0.651 | 0.662 | 0.507 | 0.639 | 0.636 | 0.641 |
| roberta_base_with_synthetic_pretraining | **0.566** | 0.689 | 0.680 | 0.698 | **0.539** | 0.664 | 0.659 | 0.669 |
| roberta_ours_with_synthetic_pretraining | 0.542 | 0.677 | 0.672 | 0.682 | 0.527 | 0.652 | 0.648 | 0.656 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.532 | 0.674 | 0.680 | 0.668 | 0.512 | 0.655 | 0.662 | 0.648 |

### LIR

The main benchmark metric for LIR is `F1`; the best results on F1 are shown in bold in the table.

| model | val-AP | <ins>val-F1</ins> | val-precision | val-recall | test-AP | <ins>test-F1</ins> | test-precision | test-recall |
|--------------------------------------------|----------|---------------------|-----------------|--------------|-----------|----------------------|------------------|---------------|
| roberta_base | 0.542 | 0.675 | 0.695 | 0.656 | 0.548 | 0.669 | 0.679 | 0.659 |
| roberta_ours | 0.533 | 0.657 | 0.672 | 0.643 | 0.571 | **0.674** | 0.685 | 0.663 |
| layoutlmv3_ours | 0.549 | 0.665 | 0.692 | 0.639 | 0.549 | 0.667 | 0.683 | 0.652 |
| roberta_base_with_synthetic_pretraining | 0.567 | 0.688 | 0.706 | 0.670 | 0.556 | 0.665 | 0.684 | 0.646 |
| roberta_ours_with_synthetic_pretraining | 0.553 | **0.689** | 0.722 | 0.659 | 0.551 | 0.671 | 0.700 | 0.644 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.553 | 0.656 | 0.679 | 0.635 | 0.555 | 0.661 | 0.682 | 0.640 |
| roberta_base_detr_table | 0.519 | 0.660 | 0.700 | 0.624 | 0.526 | 0.652 | 0.675 | 0.631 |
| roberta_base_detr_tableLI | 0.408 | 0.599 | 0.652 | 0.554 | 0.402 | 0.584 | 0.623 | 0.549 |
| roberta_base | 0.552 | 0.688 | 0.709 | 0.668 | 0.576 | 0.686 | 0.695 | 0.678 |
| roberta_ours | 0.538 | 0.662 | 0.676 | 0.649 | 0.570 | 0.686 | 0.693 | 0.678 |
| layoutlmv3_ours | 0.546 | 0.666 | 0.688 | 0.645 | 0.531 | 0.661 | 0.682 | 0.641 |
| roberta_base_with_synthetic_pretraining | 0.567 | **0.701** | 0.721 | 0.683 | 0.583 | **0.698** | 0.710 | 0.687 |
| roberta_ours_with_synthetic_pretraining | 0.549 | 0.682 | 0.703 | 0.662 | 0.559 | 0.675 | 0.696 | 0.655 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.564 | 0.681 | 0.704 | 0.659 | 0.582 | 0.691 | 0.709 | 0.673 |
| roberta_base_detr_table | 0.553 | 0.682 | 0.719 | 0.648 | 0.560 | 0.682 | 0.706 | 0.660 |
| roberta_base_detr_tableLI | 0.427 | 0.613 | 0.661 | 0.572 | 0.407 | 0.594 | 0.632 | 0.560 |
20 changes: 10 additions & 10 deletions download_dataset.sh
@@ -105,16 +105,16 @@ if [[ "$dataset" == "unlabeled" ]]; then
download_and_unzip "$token" "$targetdir" "$unzip" "unlabeled-chunk-$i"
fi
done
elif [[ "$dataset" == "baselines" ]]; then
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-base"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-layoutlmv3-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-layoutlmv3-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-detr"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-base-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-ours-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-layoutlmv3-ours-with-synthetic-pretraining"
elif [[ "$dataset" == "baselines" || "$dataset" == "baselines-20230315" ]]; then
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-base"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-layoutlmv3-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-layoutlmv3-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-detr"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-base-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-ours-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-layoutlmv3-ours-with-synthetic-pretraining"
else
download_and_unzip "$token" "$targetdir" "$unzip" "$dataset"
fi
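
For convenience, a possible invocation of this script for fetching the new baseline checkpoints is sketched below. The argument order (token, dataset name, target directory, `--unzip`) is an assumption inferred from the variables used above (`$token`, `$targetdir`, `$unzip`) and from the README instructions; check the script's own usage message for the authoritative form.

```bash
# Hypothetical usage; TOKEN is the secret token obtained via https://docile.rossum.ai/.
# The positional-argument order is assumed, not taken from this diff.
./download_dataset.sh "$TOKEN" baselines data/baselines --unzip

# The pre-April-2023 checkpoints remain available under a separate name (see baselines/README.md):
./download_dataset.sh "$TOKEN" baselines-20230315 data/baselines --unzip
```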
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docile-benchmark"
version = "0.3.0"
version = "0.3.1"
description = "Tools to work with the DocILE dataset and benchmark"
authors = [
"Stepan Simsa <stepan.simsa@rossum.ai>",
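The version bump above suggests a new release of the `docile-benchmark` package. Assuming the package is published under that name (which is not confirmed by this diff), picking up the release could look like:

```bash
# Assumes docile-benchmark 0.3.1 is published to PyPI; otherwise install from the repository.
pip install --upgrade "docile-benchmark==0.3.1"
```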