Provide new baseline checkpoints #67

Merged: 3 commits, Apr 26, 2023

42 changes: 30 additions & 12 deletions baselines/NER/run_training.sh
@@ -17,8 +17,26 @@ GPU="0"
# Choose which NER baseline trainings to run. You can run multiple trainings consecutively by
# separating them with spaces, but beware that they run in the order in which they are listed at
# the bottom of this file, not in the order given in the `run` list.

run="roberta_base"

# To run all trainings for the baselines included in the DocILE dataset paper, follow the steps
# below. (A combined sketch of steps 2-4 is shown after this file's diff.)
# Note: if you don't have enough memory on your GPU, reduce the batch size and correspondingly
# increase --gradient_accumulation_steps.
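# (Illustrative example, not part of the original script: the runs below use --train_bs 16 with
# --gradient_accumulation_steps 1, i.e. an effective batch size of 16; --train_bs 8 with
# --gradient_accumulation_steps 2 keeps the same effective batch size while using less GPU memory.)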
# Steps:
# 1. Get pretrained checkpoints for RoBERTa and LayoutLMv3. Either:
# 1a. download them: follow instructions in baselines/README.md, or
# 1b. rerun the pretrainings: set run="roberta_pretraining" here for the RoBERTa pretraining and,
# for LayoutLMv3, run `python pretrain.py` in the baselines/layoutlmv3_pretrain/ directory.
# 2. Run the following trainings (uncomment to run all consecutively):
# run="roberta_base roberta_ours layoutlmv3_base layoutlmv3_ours roberta_base_synthetic_pretraining roberta_ours_synthetic_pretraining layoutlmv3_ours_synthetic_pretraining"
# 3. Move last checkpoints of models pretrained on synthetic data to the expected location:
# cp -r \
# /app/data/baselines/trainings/${MODEL}_synthetic_pretraining/${TIMESTAMP}/checkpoint-187500 \
# /app/data/baselines/checkpoints/${MODEL}_synthetic_pretraining_187500
# 4. Run remaining trainings (uncomment to run all consecutively):
# run="roberta_base_with_synthetic_pretraining roberta_ours_with_synthetic_pretraining layoutlmv3_ours_with_synthetic_pretraining"

TIMESTAMP=$(date +"%Y%m%d_%H%M_%S")
OUTPUT_DIR_PREFIX="/app/data/baselines/trainings"

@@ -28,7 +46,7 @@ CHECKPOINTS_DIR="/app/data/baselines/checkpoints"
# Common parameters for all trainings with the exception of roberta_pretraining
DATA="--docile_path /app/data/docile/"
USE_PREPROCESSED="--preprocessed_dataset_path /app/data/baselines/preprocessed_dataset"
OTHER_COMMON_PARAMS="--save_total_limit 3 --weight_decay 0.001 --lr 2e-5 --dataloader_num_workers 8 --use_BIO_format --tag_everything --report_all_metrics --stride 0"
OTHER_COMMON_PARAMS="--save_total_limit 3 --weight_decay 0.001 --lr 2e-5 --dataloader_num_workers 8 --use_BIO_format --tag_everything --report_all_metrics"
COMMON_PARAMS="${DATA} ${USE_PREPROCESSED} ${OTHER_COMMON_PARAMS}"

# Used for synthetic pretraining of LayoutLMv3
@@ -75,7 +93,7 @@ fi

single_run="roberta_base"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1500 --gradient_accumulation_steps 4 --warmup_ratio 1.0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name roberta-base --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -84,7 +102,7 @@ fi

single_run="roberta_ours"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1000 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_pretraining_50000 --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -93,7 +111,7 @@ fi

single_run="layoutlmv3_base"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1500 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -102,7 +120,7 @@ fi

single_run="layoutlmv3_ours"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 16 --test_bs 16 --num_epochs 1000 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base --pretrained_weights ${CHECKPOINTS_DIR}/layoutlmv3_pretraining.ckpt"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -112,7 +130,7 @@ fi
single_run="roberta_base_synthetic_pretraining" # 30 epochs on synthetic data only
if [[ " ${run} " =~ " ${single_run} " ]]; then
data_params="--split synthetic"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1"
model="--model_name roberta-base --use_roberta"
all_params="${COMMON_PARAMS} ${data_params} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -122,7 +140,7 @@ fi
single_run="roberta_ours_synthetic_pretraining" # 30 epochs on synthetic data only
if [[ " ${run} " =~ " ${single_run} " ]]; then
data_params="--split synthetic"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_pretraining_50000 --use_roberta"
all_params="${COMMON_PARAMS} ${data_params} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -132,7 +150,7 @@ fi
single_run="layoutlmv3_ours_synthetic_pretraining" # 30 epochs on synthetic data only
if [[ " ${run} " =~ " ${single_run} " ]]; then
data_params="--split synthetic ${USE_ARROW}"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 30 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base --pretrained_weights ${CHECKPOINTS_DIR}/layoutlmv3_pretraining.ckpt"
all_params="${COMMON_PARAMS} ${data_params} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -141,7 +159,7 @@ fi

single_run="roberta_base_with_synthetic_pretraining"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 8 --test_bs 8 --num_epochs 1500 --gradient_accumulation_steps 8 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_base_synthetic_pretraining_187500 --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -150,7 +168,7 @@ fi

single_run="roberta_ours_with_synthetic_pretraining"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 8 --test_bs 8 --num_epochs 1500 --gradient_accumulation_steps 4 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name ${CHECKPOINTS_DIR}/roberta_ours_synthetic_pretraining_187500 --use_roberta"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -159,7 +177,7 @@ fi

single_run="layoutlmv3_ours_with_synthetic_pretraining"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 16 --test_bs 16 --num_epochs 1500 --gradient_accumulation_steps 1 --warmup_ratio 0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name microsoft/layoutlmv3-base --pretrained_weights ${CHECKPOINTS_DIR}/layoutlmv3_ours_synthetic_pretraining_187500"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
@@ -170,7 +188,7 @@ fi
# Not presented in the dataset paper
single_run="roberta_base_with_2d_embedding"
if [[ " ${run} " =~ " ${single_run} " ]]; then
train_params="--train_bs 32 --test_bs 32 --num_epochs 1500 --gradient_accumulation_steps 4 --warmup_ratio 1.0"
train_params="--train_bs 16 --test_bs 16 --num_epochs 750 --gradient_accumulation_steps 1"
model="--model_name roberta-base --use_roberta --use_new_2D_pos_emb --pos_emb_dim 6500"
all_params="${COMMON_PARAMS} ${train_params} ${model}"
output_dir="${single_run}/${TIMESTAMP}"
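For reference, a combined sketch of steps 2-4 from the comments at the top of this file is shown below. It is only an outline: `run_training.sh` selects trainings via the `run` variable edited inside the file (there is no environment override), so the wrapper assumes you edit that variable by hand before each invocation, and the checkpoint destination names follow the `*_with_synthetic_pretraining` runs above.

```bash
#!/usr/bin/env bash
# Sketch only -- not part of this PR. Edit `run` inside baselines/NER/run_training.sh by hand
# before each invocation, as described in the comments at the top of that file.
set -euo pipefail

TRAININGS_DIR="/app/data/baselines/trainings"
CHECKPOINTS_DIR="/app/data/baselines/checkpoints"

# Step 2: with run set to the seven non-"with_synthetic_pretraining" trainings, launch them all.
bash baselines/NER/run_training.sh

# Step 3: copy the last synthetic-pretraining checkpoints to the paths expected by the
# *_with_synthetic_pretraining runs (cf. their --model_name / --pretrained_weights above).
for MODEL in roberta_base roberta_ours layoutlmv3_ours; do
  # Pick the most recent timestamp directory of the corresponding step-2 run.
  TIMESTAMP="$(ls -1 "${TRAININGS_DIR}/${MODEL}_synthetic_pretraining" | sort | tail -n 1)"
  cp -r "${TRAININGS_DIR}/${MODEL}_synthetic_pretraining/${TIMESTAMP}/checkpoint-187500" \
        "${CHECKPOINTS_DIR}/${MODEL}_synthetic_pretraining_187500"
done

# Step 4: with run set to the three *_with_synthetic_pretraining trainings, launch them.
bash baselines/NER/run_training.sh
```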
32 changes: 18 additions & 14 deletions baselines/README.md
@@ -4,6 +4,10 @@ The DocILE benchmark comes with several baselines to help you get started and to

## Download checkpoints and predictions

**Info:** New checkpoints were provided on April 27, 2023. The older checkpoints can temporarily still be downloaded with the `download_dataset.sh` script by using the string `baselines-20230315` instead of `baselines`. To see the results of the previously shared checkpoints on the test and validation sets, check the history of this file. The new trainings were all run with the current version of the code and with hyperparameters set to the same values for all NER-based models.

**Warning:** Furthermore, a [bug was fixed](https://github.com/rossumai/docile/pull/64) in the NER baselines training code on April 24, 2023 (inference and pretraining were not affected). This bug did not affect any of the shared baselines, as it was introduced during refactoring after the trainings of the originally shared checkpoints had already finished and before the new trainings started.

Checkpoints of various trained models are provided with predictions on the validation set. You can download them with the same [download script](../download_dataset.sh) that is provided for downloading the dataset in the root of this repository.

First you need to obtain a secret token by following the instructions at https://docile.rossum.ai/. Then run this from the root of this repository:

@@ -43,24 +47,24 @@ The main benchmark metric for KILE is `AP`; the best results on AP are shown in bold in the table.

| model | <ins>val-AP</ins> | val-F1 | val-precision | val-recall | <ins>test-AP</ins> | test-F1 | test-precision | test-recall |
|--------------------------------------------|---------------------|----------|-----------------|--------------|----------------------|-----------|------------------|---------------|
| roberta_base | 0.531 | 0.656 | 0.645 | 0.668 | 0.515 | 0.634 | 0.623 | 0.645 |
| roberta_ours | 0.528 | 0.661 | 0.647 | 0.675 | 0.503 | 0.634 | 0.617 | 0.651 |
| layoutlmv3_ours | 0.453 | 0.608 | 0.611 | 0.605 | 0.451 | 0.587 | 0.588 | 0.585 |
| roberta_base_with_synthetic_pretraining | 0.554 | 0.680 | 0.676 | 0.683 | 0.537 | 0.659 | 0.653 | 0.665 |
| roberta_ours_with_synthetic_pretraining | **0.557** | 0.683 | 0.682 | 0.683 | **0.541** | 0.656 | 0.655 | 0.657 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.507 | 0.656 | 0.662 | 0.651 | 0.493 | 0.641 | 0.646 | 0.636 |
| roberta_base | 0.552 | 0.688 | 0.681 | 0.694 | 0.534 | 0.664 | 0.658 | 0.671 |
| roberta_ours | 0.537 | 0.671 | 0.661 | 0.682 | 0.515 | 0.645 | 0.634 | 0.656 |
| layoutlmv3_ours | 0.513 | 0.657 | 0.651 | 0.662 | 0.507 | 0.639 | 0.636 | 0.641 |
| roberta_base_with_synthetic_pretraining | **0.566** | 0.689 | 0.680 | 0.698 | **0.539** | 0.664 | 0.659 | 0.669 |
| roberta_ours_with_synthetic_pretraining | 0.542 | 0.677 | 0.672 | 0.682 | 0.527 | 0.652 | 0.648 | 0.656 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.532 | 0.674 | 0.680 | 0.668 | 0.512 | 0.655 | 0.662 | 0.648 |

### LIR

The main benchmark metric for LIR is `F1`; the best results on F1 are shown in bold in the table.

| model | val-AP | <ins>val-F1</ins> | val-precision | val-recall | test-AP | <ins>test-F1</ins> | test-precision | test-recall |
|--------------------------------------------|----------|---------------------|-----------------|--------------|-----------|----------------------|------------------|---------------|
| roberta_base | 0.542 | 0.675 | 0.695 | 0.656 | 0.548 | 0.669 | 0.679 | 0.659 |
| roberta_ours | 0.533 | 0.657 | 0.672 | 0.643 | 0.571 | **0.674** | 0.685 | 0.663 |
| layoutlmv3_ours | 0.549 | 0.665 | 0.692 | 0.639 | 0.549 | 0.667 | 0.683 | 0.652 |
| roberta_base_with_synthetic_pretraining | 0.567 | 0.688 | 0.706 | 0.670 | 0.556 | 0.665 | 0.684 | 0.646 |
| roberta_ours_with_synthetic_pretraining | 0.553 | **0.689** | 0.722 | 0.659 | 0.551 | 0.671 | 0.700 | 0.644 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.553 | 0.656 | 0.679 | 0.635 | 0.555 | 0.661 | 0.682 | 0.640 |
| roberta_base_detr_table | 0.519 | 0.660 | 0.700 | 0.624 | 0.526 | 0.652 | 0.675 | 0.631 |
| roberta_base_detr_tableLI | 0.408 | 0.599 | 0.652 | 0.554 | 0.402 | 0.584 | 0.623 | 0.549 |
| roberta_base | 0.552 | 0.688 | 0.709 | 0.668 | 0.576 | 0.686 | 0.695 | 0.678 |
| roberta_ours | 0.538 | 0.662 | 0.676 | 0.649 | 0.570 | 0.686 | 0.693 | 0.678 |
| layoutlmv3_ours | 0.546 | 0.666 | 0.688 | 0.645 | 0.531 | 0.661 | 0.682 | 0.641 |
| roberta_base_with_synthetic_pretraining | 0.567 | **0.701** | 0.721 | 0.683 | 0.583 | **0.698** | 0.710 | 0.687 |
| roberta_ours_with_synthetic_pretraining | 0.549 | 0.682 | 0.703 | 0.662 | 0.559 | 0.675 | 0.696 | 0.655 |
| layoutlmv3_ours_with_synthetic_pretraining | 0.564 | 0.681 | 0.704 | 0.659 | 0.582 | 0.691 | 0.709 | 0.673 |
| roberta_base_detr_table | 0.553 | 0.682 | 0.719 | 0.648 | 0.560 | 0.682 | 0.706 | 0.660 |
| roberta_base_detr_tableLI | 0.427 | 0.613 | 0.661 | 0.572 | 0.407 | 0.594 | 0.632 | 0.560 |
20 changes: 10 additions & 10 deletions download_dataset.sh
@@ -105,16 +105,16 @@ if [[ "$dataset" == "unlabeled" ]]; then
download_and_unzip "$token" "$targetdir" "$unzip" "unlabeled-chunk-$i"
fi
done
elif [[ "$dataset" == "baselines" ]]; then
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-base"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-layoutlmv3-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-layoutlmv3-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-detr"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-base-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-roberta-ours-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "baselines-layoutlmv3-ours-with-synthetic-pretraining"
elif [[ "$dataset" == "baselines" || "$dataset" == "baselines-20230315" ]]; then
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-base"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-layoutlmv3-ours"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-layoutlmv3-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-detr"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-base-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-roberta-ours-with-synthetic-pretraining"
download_and_unzip "$token" "$targetdir" "$unzip" "${dataset}-layoutlmv3-ours-with-synthetic-pretraining"
else
download_and_unzip "$token" "$targetdir" "$unzip" "$dataset"
fi
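
For convenience, a possible invocation of this script for fetching the new baseline checkpoints is sketched below. The argument order (token, dataset name, target directory, `--unzip`) is an assumption inferred from the variables used above (`$token`, `$targetdir`, `$unzip`) and from the README instructions; check the script's own usage message for the authoritative form.

```bash
# Hypothetical usage; TOKEN is the secret token obtained via https://docile.rossum.ai/.
# The positional-argument order is assumed, not taken from this diff.
./download_dataset.sh "$TOKEN" baselines data/baselines --unzip

# The pre-April-2023 checkpoints remain available under a separate name (see baselines/README.md):
./download_dataset.sh "$TOKEN" baselines-20230315 data/baselines --unzip
```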
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docile-benchmark"
version = "0.3.0"
version = "0.3.1"
description = "Tools to work with the DocILE dataset and benchmark"
authors = [
"Stepan Simsa <stepan.simsa@rossum.ai>",
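The version bump above suggests a new release of the `docile-benchmark` package. Assuming the package is published under that name (which is not confirmed by this diff), picking up the release could look like:

```bash
# Assumes docile-benchmark 0.3.1 is published to PyPI; otherwise install from the repository.
pip install --upgrade "docile-benchmark==0.3.1"
```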