From 7b070007da3b4a4b666084c8afb99eac41286281 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 12 Aug 2024 10:38:41 +0200 Subject: [PATCH 1/6] avoid null pointer exceptions during training data generation with certain nasty documents --- .../grobid/core/engines/FullTextParser.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 953d92f8b1..d2b1bf8661 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -1234,17 +1234,19 @@ public Document createTraining(File inputFile, "\t\t\t\t\n\t\t\t\t\t\n\n"); for (LabeledReferenceResult ref : references) { - if ( (ref.getReferenceText() != null) && (ref.getReferenceText().trim().length() > 0) ) { + if ( StringUtils.isNotBlank(ref.getReferenceText()) ) { BiblioItem bib = parsers.getCitationParser().processingString(ref.getReferenceText(), 0); - String authorSequence = bib.getAuthors(); - if ((authorSequence != null) && (authorSequence.trim().length() > 0) ) { - /*List inputs = new ArrayList(); - inputs.add(authorSequence);*/ - StringBuilder bufferName = parsers.getAuthorParser().trainingExtraction(authorSequence, false); - if ( (bufferName != null) && (bufferName.length()>0) ) { - writerName.write("\n\t\t\t\t\t\t"); - writerName.write(bufferName.toString()); - writerName.write("\n"); + if (bib != null) { + String authorSequence = bib.getAuthors(); + if (StringUtils.isNotBlank(authorSequence)) { + /*List inputs = new ArrayList(); + inputs.add(authorSequence);*/ + StringBuilder bufferName = parsers.getAuthorParser().trainingExtraction(authorSequence, false); + if ((bufferName != null) && (bufferName.length() > 0)) { + writerName.write("\n\t\t\t\t\t\t"); + writerName.write(bufferName.toString()); + writerName.write("\n"); + } } } } From 85eaafd6007309c4d42ead1f83fa79ace0b4dbd9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 29 Aug 2024 07:18:27 +0200 Subject: [PATCH 2/6] Skip evaluation when split is 1 --- .../src/main/java/org/grobid/trainer/AbstractTrainer.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java b/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java index f26b1eea0d..6084e63dc9 100755 --- a/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/AbstractTrainer.java @@ -171,6 +171,10 @@ public String splitTrainEvaluate(Double split, boolean incremental) { // if we are here, that means that training succeeded renameModels(oldModelPath, tempModelPath); + if (split == 1.0) { + return "Split ratio is 1.0, no evaluation performed."; + } + return EvaluationUtilities.evaluateStandard(evalDataPath.getAbsolutePath(), getTagger()).toString(); } From aac2c4235557c705f072bc665616ff98883fb61b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 27 Aug 2024 22:01:59 +0200 Subject: [PATCH 3/6] update github action, fix manual actions, enable docker image tag suffix --- .github/workflows/ci-build-manual-crf.yml | 22 ++++++++++++++-------- .github/workflows/ci-build-manual-full.yml | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-build-manual-crf.yml b/.github/workflows/ci-build-manual-crf.yml index 619529d63b..84a1c56465 100644 --- a/.github/workflows/ci-build-manual-crf.yml +++ b/.github/workflows/ci-build-manual-crf.yml @@ -1,8 +1,12 @@ name: Build and push a CRF-only docker image -on: - workflow_dispatch: - +on: + workflow_dispatch: + inputs: + suffix: + type: string + description: Docker image suffix (e.g. develop, crf, full) + required: false jobs: build: @@ -26,17 +30,19 @@ jobs: steps: - name: Create more disk space run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build and push id: docker_build uses: mr-smithers-excellent/docker-build-push@v6 with: - dockerfile: Dockerfile - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} + dockerfile: Dockerfile.crf + username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }} + password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }} image: lfoppiano/grobid registry: docker.io pushImage: true - tags: latest-develop, latest-crf + tags: | + latest-develop${{ github.event.inputs.suffix && '-' + github.event.inputs.suffix || '' }} + latest-crf${{ github.event.inputs.suffix && '-' + github.event.inputs.suffix || '' }} - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-build-manual-full.yml b/.github/workflows/ci-build-manual-full.yml index 9671177fa8..ce1a0a175b 100644 --- a/.github/workflows/ci-build-manual-full.yml +++ b/.github/workflows/ci-build-manual-full.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Create more disk space run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Build and push id: docker_build uses: mr-smithers-excellent/docker-build-push@v5 From 65c964a72a778bb32dea35cf672ec0818cd926c5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 30 Aug 2024 09:20:18 +0200 Subject: [PATCH 4/6] fix syntax for github actions yaml --- .github/workflows/ci-build-manual-crf.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build-manual-crf.yml b/.github/workflows/ci-build-manual-crf.yml index 84a1c56465..ad66c58f94 100644 --- a/.github/workflows/ci-build-manual-crf.yml +++ b/.github/workflows/ci-build-manual-crf.yml @@ -42,7 +42,7 @@ jobs: registry: docker.io pushImage: true tags: | - latest-develop${{ github.event.inputs.suffix && '-' + github.event.inputs.suffix || '' }} - latest-crf${{ github.event.inputs.suffix && '-' + github.event.inputs.suffix || '' }} + latest-develop${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} + latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} From 0f38cfde3e2280fd260a1edae837b8d8964193b5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 30 Aug 2024 09:29:43 +0200 Subject: [PATCH 5/6] fix tag syntax --- .github/workflows/ci-build-manual-crf.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-build-manual-crf.yml b/.github/workflows/ci-build-manual-crf.yml index ad66c58f94..cabc1ab3ef 100644 --- a/.github/workflows/ci-build-manual-crf.yml +++ b/.github/workflows/ci-build-manual-crf.yml @@ -42,7 +42,7 @@ jobs: registry: docker.io pushImage: true tags: | - latest-develop${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} + latest-develop${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }}, | latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} From 399ef9d34683bc2b2711ba191e53020d6fd33132 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 30 Aug 2024 10:13:40 +0200 Subject: [PATCH 6/6] fix tag syntax --- .github/workflows/ci-build-manual-crf.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci-build-manual-crf.yml b/.github/workflows/ci-build-manual-crf.yml index cabc1ab3ef..815d4e8cc2 100644 --- a/.github/workflows/ci-build-manual-crf.yml +++ b/.github/workflows/ci-build-manual-crf.yml @@ -42,7 +42,6 @@ jobs: registry: docker.io pushImage: true tags: | - latest-develop${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }}, | - latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} + latest-develop${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }}, latest-crf${{ github.event.inputs.suffix != '' && '-' || '' }}${{ github.event.inputs.suffix }} - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }}