
Commit

Merge branch 'master' into new-figure-table-models
kermitt2 committed Sep 23, 2024
2 parents d649e22 + f501033 commit 44d1801
Showing 16 changed files with 328 additions and 89 deletions.
22 changes: 14 additions & 8 deletions .github/workflows/ci-build-manual-crf.yml
@@ -1,8 +1,13 @@
name: Build and push a CRF-only docker image

on:
workflow_dispatch:

on:
workflow_dispatch:
inputs:
custom_tag:
type: string
description: Docker image tag
required: true
default: "latest-crf"

jobs:
build:
@@ -26,17 +31,18 @@ jobs:
steps:
- name: Create more disk space
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Build and push
id: docker_build
uses: mr-smithers-excellent/docker-build-push@v6
with:
dockerfile: Dockerfile
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
dockerfile: Dockerfile.crf
username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }}
password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }}
image: lfoppiano/grobid
registry: docker.io
pushImage: true
tags: latest-develop, latest-crf
tags: |
latest-develop, ${{ github.event.inputs.custom_tag}}
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
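
With the new `custom_tag` input this workflow is started manually; one possible way to trigger it, assuming the GitHub CLI is available (the tag value below is only an illustration):

    gh workflow run ci-build-manual-crf.yml -f custom_tag=0.8.2-crf
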
14 changes: 10 additions & 4 deletions .github/workflows/ci-build-manual-full.yml
@@ -1,7 +1,13 @@
name: Build and push a full docker image

on: "workflow_dispatch"

on:
workflow_dispatch:
inputs:
custom_tag:
type: string
description: Docker image tag
required: true
default: "latest-full"

jobs:
build:
@@ -25,7 +31,7 @@ jobs:
steps:
- name: Create more disk space
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Build and push
id: docker_build
uses: mr-smithers-excellent/docker-build-push@v5
@@ -35,7 +41,7 @@ jobs:
image: lfoppiano/grobid
registry: docker.io
pushImage: true
tags: latest-full
tags: latest-full, ${{ github.event.inputs.custom_tag}}
dockerfile: Dockerfile.delft
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
6 changes: 4 additions & 2 deletions .gitignore
@@ -57,9 +57,10 @@ grobid-keyterm
dataseer-ml
datastet
grobid-test-ant
grobid-home/models/quantities
grobid-home/models/quantities*
grobid-home/models/dictionaries-lexical-entries
grobid-home/models/units
grobid-home/models/units*
grobid-home/models/values*
grobid-home/models/bio
grobid-home/models/ner
grobid-home/models/nerfr
@@ -86,3 +87,4 @@ grobid-home/models/context_*
Dockerfile.dataseer
Dockerfile.software
Dockerfile.datastet
.run
19 changes: 15 additions & 4 deletions CHANGELOG.md
@@ -4,28 +4,39 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.8.1] - 2024-06-10
## [0.8.1] - 2024-09-14

### Added
- Identified URLs are now added in the TEI output #1099
- Added DL models for patent processing #1082
- Copyright and licence identification models #1078
- Copyright owner and license identification models #1078
- Add research infrastructure recognition for funding processing #1085
- Add paragraphs coordinates in the TEI output #1068
- Specify configuration file with DL models enabled for the full docker image #1117
- Support for biblio-glutton 0.3 #1086

### Changed
- Update affiliation process #1069
- Improved the recognition of URLs using (when available) PDF annotations, such as clickable links
- Updated TEI schema #1084
- Review patent process #1082
- Add Kotlin language to support development and testing #1096

### Fixed
- Sentence segmentation avoids to split sentences with an URL in the middle #1097
- Sentence segmentation is now applied to funding and acknowledgement #1106
- Avoid splitting URLs between sentences #1097
- Add missing sentence segmentation in funding and acknowledgement #1106
- Docker image was optimized to reduce the needed space #1088
- Fixed OOBE when processing large quantities of notes #1075
- Corrected `<title>` coordinate attribute name #1070
- Fix missing coordinates in paragraph continuation #1076
- Fixed JSON log output
- Fixed notes identification #1124
- Fixed extraneous semicolon in the training data #1133
- Reduced security vulnerabilities in the dependencies #1136 #1137

## New Contributors
* @tanaynayak made their first contribution in https://github.com/kermitt2/grobid/pull/1133
* @vipulg13 made their first contribution in https://github.com/kermitt2/grobid/pull/1137

## [0.8.0] - 2023-11-19

40 changes: 31 additions & 9 deletions build.gradle
@@ -11,6 +11,7 @@ buildscript {
classpath "gradle.plugin.com.github.jengelman.gradle.plugins:shadow:7.0.0"
classpath 'com.adarshr:gradle-test-logger-plugin:2.0.0'
classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:1.8.21"
classpath group: 'net.researchgate', name: 'gradle-release', version: '3.0.2'
}
}

@@ -34,6 +35,7 @@ allprojects {
apply plugin: 'com.github.kt3k.coveralls'
apply plugin: 'com.adarshr.test-logger'
apply plugin: 'org.jetbrains.kotlin.jvm'
apply plugin: 'net.researchgate.release'

group = "org.grobid"

@@ -60,19 +62,29 @@ subprojects {
}
}

// sourceCompatibility = 1.11
// targetCompatibility = 1.11
sourceCompatibility = 1.11
targetCompatibility = 1.11

kotlin {
jvmToolchain(11)
}

java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(11))
tasks.withType(KotlinCompile).configureEach {
sourceCompatibility = JavaVersion.VERSION_11
targetCompatibility = JavaVersion.VERSION_11
kotlinOptions {
jvmTarget = JavaVersion.VERSION_11
}
}

// kotlin {
// jvmToolchain(11)
// }

// java {
// toolchain {
// languageVersion.set(JavaLanguageVersion.of(11))
// vendor.set(JvmVendorSpec.ADOPTIUM)
//
// }
// }

repositories {
mavenCentral()
maven {
@@ -324,6 +336,7 @@ project("grobid-home") {
}

import org.apache.tools.ant.taskdefs.condition.Os
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile

project(":grobid-service") {
apply plugin: 'application'
@@ -641,3 +654,12 @@ wrapper {
}

build.dependsOn project.getSubprojects().collect({ it.getTasks().getByName("build") })

release {
failOnUnversionedFiles = false
failOnCommitNeeded = false
tagTemplate = '${version}'
git {
requireBranch.set('master')
}
}
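
A sketch of how the newly configured gradle-release plugin would typically be invoked; the useAutomaticVersion property follows the plugin's own documentation and is an assumption here, not something set in this build file:

    ./gradlew release -Prelease.useAutomaticVersion=true
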
2 changes: 1 addition & 1 deletion doc/training/General-principles.md
@@ -8,7 +8,7 @@ This maybe of interest if the current state of the models does not correctly rec

The addition of training data in Grobid is __not__ done from scratch, but from pre-annotated training data generated by the existing models in Grobid. This ensures that the syntax of the new training data will (normally) be correct and that the stream of text will be easy to align with the text extracted from the PDF. It also takes advantage of the existing models, which already annotate a certain amount of text correctly, so the annotator can focus on the corrections, improving productivity.

For generating pre-annotated training files for Grobid based on the existing models, see the instructions for running the software in batch [here](../Training-the-models-of-Grobid/#generation-of-training-data) and [here](../Grobid-batch/#createtraining).
For generating pre-annotated training files for Grobid based on the existing models, see the instructions for running the software in batch [here](../../Training-the-models-of-Grobid/#generation-of-training-data) and [here](../../Grobid-batch/#createtraining).
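
For reference, a minimal sketch of such a batch invocation, assuming the standard options described in the linked Grobid-batch documentation (jar path, version and directories are placeholders):

    java -Xmx4G -jar grobid-core/build/libs/grobid-core-0.8.2-SNAPSHOT-onejar.jar \
        -gH grobid-home -dIn /path/to/pdfs -dOut /path/to/output -exe createTraining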

After running the `createTraining` batch on a set of PDF files, each article comes with:

4 changes: 3 additions & 1 deletion gradle.properties
@@ -1,4 +1,4 @@
version=0.8.1
version=0.8.2-SNAPSHOT
# Set workers to 1 that even for parallel builds it works. (I guess the shadow plugin makes some trouble)
org.gradle.workers.max=1
org.gradle.caching = true
@@ -10,3 +10,5 @@ org.gradle.vfs.watch = true
#systemProp.https.proxyPort=
#systemProp.https.proxyHost=
#systemProp.https.proxyPort=

org.gradle.java.installations.auto-download=false
51 changes: 42 additions & 9 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
@@ -107,6 +107,7 @@ public String toString() {
", PMID='" + PMID + '\'' +
", PMCID='" + PMCID + '\'' +
", PII='" + PII + '\'' +
", HALId='" + halId + '\'' +
", ark='" + ark + '\'' +
", istexId='" + istexId + '\'' +
", inDOI='" + inDOI + '\'' +
@@ -256,6 +257,7 @@ public String toString() {
private String PMID = null;
private String PMCID = null;
private String PII = null;
private String halId = null;
private String ark = null;
private String istexId = null;
private String abstract_ = null;
@@ -526,6 +528,10 @@ public String getDOI() {
return doi;
}

public String getHalId() {
return halId;
}

public String getArk() {
return ark;
}
@@ -1060,9 +1066,20 @@ public static String cleanDOI(String doi) {
doi = doi.replaceAll("[\\p{M}]", "");
doi = doi.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");

// remove possible starting/trailing parenthesis
if (doi.startsWith("(") || doi.startsWith("[") || doi.startsWith("⟨"))
doi = doi.substring(1);

if (doi.endsWith(")") || doi.endsWith("]") || doi.endsWith("⟩"))
doi = doi.substring(0,doi.length()-1);

return doi;
}

public void setHalId(String halId) {
this.halId = halId;
}

public void setArXivId(String id) {
if (id != null) {
arXivId = StringUtils.normalizeSpace(id);
@@ -1591,6 +1608,7 @@ public void reset() {
type = null;
book_type = null;
doi = null;
halId = null;
istexId = null;
ark = null;
inDOI = null;
@@ -2169,7 +2187,7 @@ else if (pubnum != null && pubnum.length() == 13)
}
}

// TODO: PII
// TODO: PII and HALId

}

@@ -2345,6 +2363,13 @@ else if (bookTitle == null) {
tei.append("<idno type=\"DOI\">" + TextUtilities.HTMLEncode(doi) + "</idno>\n");
}

if (!StringUtils.isEmpty(halId)) {
for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
}
tei.append("<idno type=\"HALid\">" + TextUtilities.HTMLEncode(halId) + "</idno>\n");
}

if (!StringUtils.isEmpty(arXivId)) {
for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
@@ -2786,9 +2811,6 @@ else if (this.getYear().length() == 4)
}
}

/*for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
}*/
if ((volumeBlock != null) | (issue != null) || (pageRange != null) || (publication_date != null)
|| (publisher != null)) {
for (int i = 0; i < indent + 2; i++) {
@@ -2947,7 +2969,12 @@ else if (this.getYear().length() == 4)
for (int i = 0; i < indent + 2; i++) {
tei.append("\t");
}
if ((publication_date != null) || (pageRange != null) || (location != null) || (publisher != null) || (volumeBlock != null)) {
if (normalized_publication_date != null ||
publication_date != null ||
pageRange != null ||
location != null ||
publisher != null ||
volumeBlock != null) {
tei.append("<imprint>\n");
}
else {
@@ -3177,12 +3204,13 @@ else if (this.getYear().length() == 4)
}

if (uri != null) {
if (uri.startsWith("http://hal.")) {
/*if (uri.startsWith("http://hal.") || ) {
for (int i = 0; i < indent + 1; i++) {
tei.append("\t");
}
tei.append("<idno type=\"HALid\">" + TextUtilities.HTMLEncode(uri) + "</idno>\n");
} else {
} else */
{
for (int i = 0; i < indent + 1; i++) {
tei.append("\t");
}
@@ -3191,7 +3219,7 @@ else if (this.getYear().length() == 4)
}

if (url != null) {
if (url.startsWith("http://hal.")) {
if (url.startsWith("http://hal.") || url.startsWith("https://hal.")) {
for (int i = 0; i < indent + 1; i++) {
tei.append("\t");
}
@@ -4117,6 +4145,7 @@ public static void injectIdentifiers(BiblioItem destination, BiblioItem source)
destination.setPII(source.getPII());
destination.setIstexId(source.getIstexId());
destination.setArk(source.getArk());
destination.setHalId(source.getHalId());
}

/**
@@ -4140,6 +4169,8 @@ public static void correct(BiblioItem bib, BiblioItem bibo) {
bib.setIstexId(bibo.getIstexId());
if (bibo.getArk() != null)
bib.setArk(bibo.getArk());
if (bibo.getHalId() != null)
bib.setHalId(bibo.getHalId());

if (bibo.getOAURL() != null)
bib.setOAURL(bibo.getOAURL());
@@ -4243,6 +4274,8 @@ public static void correct(BiblioItem bib, BiblioItem bibo) {
bib.setISBN10(bibo.getISBN10());
if (bibo.getISBN13() != null)
bib.setISBN13(bibo.getISBN13());
if (bibo.getHalId() != null)
bib.setHalId(bibo.getHalId());

if (bibo.getItem() != -1) {
bib.setItem(bibo.getItem());
@@ -4361,7 +4394,7 @@ public boolean rejectAsReference() {
if (fullAuthors == null && collaboration == null)
authorSet = false;
// normally properties authors and authorList are null in the current Grobid version
if (!titleSet && !authorSet && (url == null) && (doi == null))
if (!titleSet && !authorSet && url == null && doi == null && halId ==null)
return true;
else
return false;
(The diffs for the remaining 8 changed files are not shown.)
