From 5b0dd4f5cb1b47005407024f212cef349c744685 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 30 Dec 2024 20:28:26 +0100 Subject: [PATCH 1/6] cosmetics --- .../src/main/java/org/grobid/core/document/TEIFormatter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index d9f2c46006..8f9c12aad1 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -923,8 +923,8 @@ else if (biblio.getE_Year().length() == 4) tei.append("\t\t\t\n"); } - if ((abstractText != null) && (abstractText.length() != 0)) { - if ( (biblio.getLabeledAbstract() != null) && (biblio.getLabeledAbstract().length() > 0) ) { + if (StringUtils.isNotBlank(abstractText)) { + if (StringUtils.isNotBlank(biblio.getLabeledAbstract())) { // we have available structured abstract, which can be serialized as a full text "piece" StringBuilder buffer = new StringBuilder(); try { From 32e8a8889cfd74b07461b5a6d3545595c0cfba20 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 31 Dec 2024 10:46:27 +0100 Subject: [PATCH 2/6] update language package to support oldstyle Adobe non-standard fonts names --- .../languages/xpdf-oldstyle/oldstyle.nameToUnicode | 10 ++++++++++ grobid-home/pdfalto/languages/xpdfrc | 3 +++ grobid-home/pdfalto/lin-64/xpdfrc | 4 ++++ grobid-home/pdfalto/mac-64/xpdfrc | 3 +++ grobid-home/pdfalto/mac_arm-64/xpdfrc | 3 +++ 5 files changed, 23 insertions(+) create mode 100644 grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode diff --git a/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode new file mode 100644 index 0000000000..7af14c9a80 --- /dev/null +++ b/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode @@ -0,0 +1,10 @@ +0030 zero.oldstyle +0031 one.oldstyle +0032 two.oldstyle +0033 three.oldstyle +0034 four.oldstyle +0035 five.oldstyle +0036 six.oldstyle +0037 seven.oldstyle +0038 eight.oldstyle +0039 nine.oldstyle \ No newline at end of file diff --git a/grobid-home/pdfalto/languages/xpdfrc b/grobid-home/pdfalto/languages/xpdfrc index 895c5debed..eb7b0e4ed1 100644 --- a/grobid-home/pdfalto/languages/xpdfrc +++ b/grobid-home/pdfalto/languages/xpdfrc @@ -59,3 +59,6 @@ unicodeMap TIS-620 languages/xpdf-thai/TIS-620.unicodeMap #----- begin Turkish support package (2011-aug-15) unicodeMap ISO-8859-9 languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package +#----- begin oldstyle support package (2024-dec-31) +nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +#----- end oldstyle support package diff --git a/grobid-home/pdfalto/lin-64/xpdfrc b/grobid-home/pdfalto/lin-64/xpdfrc index 5b909e6b33..cd95ac9e89 100644 --- a/grobid-home/pdfalto/lin-64/xpdfrc +++ b/grobid-home/pdfalto/lin-64/xpdfrc @@ -59,3 +59,7 @@ unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap #----- begin Turkish support package (2011-aug-15) unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package +#----- begin oldstyle support package (2024-dec-31) +nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +#----- end oldstyle support package + diff --git a/grobid-home/pdfalto/mac-64/xpdfrc b/grobid-home/pdfalto/mac-64/xpdfrc index 5b909e6b33..3d142d13f0 100644 --- a/grobid-home/pdfalto/mac-64/xpdfrc +++ b/grobid-home/pdfalto/mac-64/xpdfrc @@ -59,3 +59,6 @@ unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap #----- begin Turkish support package (2011-aug-15) unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package +#----- begin oldstyle support package (2024-dec-31) +nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +#----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac_arm-64/xpdfrc b/grobid-home/pdfalto/mac_arm-64/xpdfrc index 5b909e6b33..3d142d13f0 100644 --- a/grobid-home/pdfalto/mac_arm-64/xpdfrc +++ b/grobid-home/pdfalto/mac_arm-64/xpdfrc @@ -59,3 +59,6 @@ unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap #----- begin Turkish support package (2011-aug-15) unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package +#----- begin oldstyle support package (2024-dec-31) +nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +#----- end oldstyle support package From d308e9f226b21474383e1d674a3ca5f2ad8a47e2 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 31 Dec 2024 11:22:31 +0100 Subject: [PATCH 3/6] add f_f ligature --- .../pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode index 7af14c9a80..89f02f5f81 100644 --- a/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode +++ b/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode @@ -7,4 +7,5 @@ 0036 six.oldstyle 0037 seven.oldstyle 0038 eight.oldstyle -0039 nine.oldstyle \ No newline at end of file +0039 nine.oldstyle +fb00 f_f \ No newline at end of file From 9c075a0e0d5705e80c28faf2b67b4d5baee583b9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 31 Dec 2024 12:55:24 +0100 Subject: [PATCH 4/6] fix ligatures and other weirdos fonts --- .../pdfalto/languages/xpdf-others/fitted.nameToUnicode | 7 +++++++ .../pdfalto/languages/xpdf-others/ligatures.nameToUnicode | 6 ++++++ .../{xpdf-oldstyle => xpdf-others}/oldstyle.nameToUnicode | 3 +-- .../languages/xpdf-others/taboldstyle.nameToUnicode | 1 + grobid-home/pdfalto/languages/xpdfrc | 5 ++++- grobid-home/pdfalto/lin-64/xpdfrc | 5 ++++- grobid-home/pdfalto/mac-64/xpdfrc | 5 ++++- grobid-home/pdfalto/mac_arm-64/xpdfrc | 5 ++++- 8 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 grobid-home/pdfalto/languages/xpdf-others/fitted.nameToUnicode create mode 100644 grobid-home/pdfalto/languages/xpdf-others/ligatures.nameToUnicode rename grobid-home/pdfalto/languages/{xpdf-oldstyle => xpdf-others}/oldstyle.nameToUnicode (86%) create mode 100644 grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode diff --git a/grobid-home/pdfalto/languages/xpdf-others/fitted.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/fitted.nameToUnicode new file mode 100644 index 0000000000..9aba076d9f --- /dev/null +++ b/grobid-home/pdfalto/languages/xpdf-others/fitted.nameToUnicode @@ -0,0 +1,7 @@ +0030 zero.fitted +0031 one.fitted +0032 two.fitted +0033 three.fitted +0034 four.fitted +0035 five.fitted +0036 six.fitted \ No newline at end of file diff --git a/grobid-home/pdfalto/languages/xpdf-others/ligatures.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/ligatures.nameToUnicode new file mode 100644 index 0000000000..831b07c8ff --- /dev/null +++ b/grobid-home/pdfalto/languages/xpdf-others/ligatures.nameToUnicode @@ -0,0 +1,6 @@ +fb00 f_f +fb01 f_i +fb02 f_l +fb03 f_f_i + + diff --git a/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/oldstyle.nameToUnicode similarity index 86% rename from grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode rename to grobid-home/pdfalto/languages/xpdf-others/oldstyle.nameToUnicode index 89f02f5f81..7af14c9a80 100644 --- a/grobid-home/pdfalto/languages/xpdf-oldstyle/oldstyle.nameToUnicode +++ b/grobid-home/pdfalto/languages/xpdf-others/oldstyle.nameToUnicode @@ -7,5 +7,4 @@ 0036 six.oldstyle 0037 seven.oldstyle 0038 eight.oldstyle -0039 nine.oldstyle -fb00 f_f \ No newline at end of file +0039 nine.oldstyle \ No newline at end of file diff --git a/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode new file mode 100644 index 0000000000..95fbb2b43f --- /dev/null +++ b/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode @@ -0,0 +1 @@ +0038 eight.taboldstyle \ No newline at end of file diff --git a/grobid-home/pdfalto/languages/xpdfrc b/grobid-home/pdfalto/languages/xpdfrc index eb7b0e4ed1..628a6d9ffd 100644 --- a/grobid-home/pdfalto/languages/xpdfrc +++ b/grobid-home/pdfalto/languages/xpdfrc @@ -60,5 +60,8 @@ unicodeMap TIS-620 languages/xpdf-thai/TIS-620.unicodeMap unicodeMap ISO-8859-9 languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package #----- begin oldstyle support package (2024-dec-31) -nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +nameToUnicode languages/xpdf-others/oldstyle.nameToUnicode +nameToUnicode languages/xpdf-others/taboldstyle.nameToUnicode +nameToUnicode languages/xpdf-others/ligatures.nameToUnicode +nameToUnicode languages/xpdf-others/fitted.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/lin-64/xpdfrc b/grobid-home/pdfalto/lin-64/xpdfrc index cd95ac9e89..93d893976e 100644 --- a/grobid-home/pdfalto/lin-64/xpdfrc +++ b/grobid-home/pdfalto/lin-64/xpdfrc @@ -60,6 +60,9 @@ unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package #----- begin oldstyle support package (2024-dec-31) -nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/oldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode +nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac-64/xpdfrc b/grobid-home/pdfalto/mac-64/xpdfrc index 3d142d13f0..9378089095 100644 --- a/grobid-home/pdfalto/mac-64/xpdfrc +++ b/grobid-home/pdfalto/mac-64/xpdfrc @@ -60,5 +60,8 @@ unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package #----- begin oldstyle support package (2024-dec-31) -nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/oldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode +nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac_arm-64/xpdfrc b/grobid-home/pdfalto/mac_arm-64/xpdfrc index 3d142d13f0..9378089095 100644 --- a/grobid-home/pdfalto/mac_arm-64/xpdfrc +++ b/grobid-home/pdfalto/mac_arm-64/xpdfrc @@ -60,5 +60,8 @@ unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap #----- end Turkish support package #----- begin oldstyle support package (2024-dec-31) -nameToUnicode ../languages/xpdf-oldstyle/oldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/oldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode +nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode +nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode #----- end oldstyle support package From 04a048cb1c48e2425183d28bab80958e20508db7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 31 Dec 2024 16:39:21 +0100 Subject: [PATCH 5/6] add more cases --- .../languages/xpdf-others/others.nameToUnicode | 4 ++++ .../languages/xpdf-others/taboldstyle.nameToUnicode | 11 ++++++++++- grobid-home/pdfalto/lin-64/xpdfrc | 1 + grobid-home/pdfalto/mac-64/xpdfrc | 1 + grobid-home/pdfalto/mac_arm-64/xpdfrc | 1 + 5 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 grobid-home/pdfalto/languages/xpdf-others/others.nameToUnicode diff --git a/grobid-home/pdfalto/languages/xpdf-others/others.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/others.nameToUnicode new file mode 100644 index 0000000000..9851bb556c --- /dev/null +++ b/grobid-home/pdfalto/languages/xpdf-others/others.nameToUnicode @@ -0,0 +1,4 @@ +2113 lscript +2202 partialdiff +21A9 arrowhookleft +21A9 arrowrighttophalf \ No newline at end of file diff --git a/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode index 95fbb2b43f..1093439560 100644 --- a/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode +++ b/grobid-home/pdfalto/languages/xpdf-others/taboldstyle.nameToUnicode @@ -1 +1,10 @@ -0038 eight.taboldstyle \ No newline at end of file +0030 zero.taboldstyle +0031 one.taboldstyle +0032 two.taboldstyle +0033 three.taboldstyle +0034 four.taboldstyle +0035 five.taboldstyle +0036 six.taboldstyle +0037 seven.taboldstyle +0038 eight.taboldstyle +0039 nine.taboldstyle \ No newline at end of file diff --git a/grobid-home/pdfalto/lin-64/xpdfrc b/grobid-home/pdfalto/lin-64/xpdfrc index 93d893976e..0f59473c4d 100644 --- a/grobid-home/pdfalto/lin-64/xpdfrc +++ b/grobid-home/pdfalto/lin-64/xpdfrc @@ -64,5 +64,6 @@ nameToUnicode ../languages/xpdf-others/oldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode +nameToUnicode ../languages/xpdf-others/others.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac-64/xpdfrc b/grobid-home/pdfalto/mac-64/xpdfrc index 9378089095..37bd6ea62e 100644 --- a/grobid-home/pdfalto/mac-64/xpdfrc +++ b/grobid-home/pdfalto/mac-64/xpdfrc @@ -64,4 +64,5 @@ nameToUnicode ../languages/xpdf-others/oldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode +nameToUnicode ../languages/xpdf-others/others.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac_arm-64/xpdfrc b/grobid-home/pdfalto/mac_arm-64/xpdfrc index 9378089095..37bd6ea62e 100644 --- a/grobid-home/pdfalto/mac_arm-64/xpdfrc +++ b/grobid-home/pdfalto/mac_arm-64/xpdfrc @@ -64,4 +64,5 @@ nameToUnicode ../languages/xpdf-others/oldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode +nameToUnicode ../languages/xpdf-others/others.nameToUnicode #----- end oldstyle support package From f19ee1b1b94a8c1028cb589d36ff64e4c1504606 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 31 Dec 2024 16:47:56 +0100 Subject: [PATCH 6/6] add more cases --- .../languages/xpdf-others/sc.nameToUnicode | 26 +++++++++++++++++++ grobid-home/pdfalto/languages/xpdfrc | 2 ++ grobid-home/pdfalto/lin-64/xpdfrc | 1 + grobid-home/pdfalto/mac-64/xpdfrc | 1 + grobid-home/pdfalto/mac_arm-64/xpdfrc | 1 + 5 files changed, 31 insertions(+) create mode 100644 grobid-home/pdfalto/languages/xpdf-others/sc.nameToUnicode diff --git a/grobid-home/pdfalto/languages/xpdf-others/sc.nameToUnicode b/grobid-home/pdfalto/languages/xpdf-others/sc.nameToUnicode new file mode 100644 index 0000000000..8418b37b30 --- /dev/null +++ b/grobid-home/pdfalto/languages/xpdf-others/sc.nameToUnicode @@ -0,0 +1,26 @@ +0061 a.sc +0062 b.sc +0063 c.sc +0064 d.sc +0065 e.sc +0066 f.sc +0067 g.sc +0068 h.sc +0069 i.sc +006a j.sc +006c l.sc +006d m.sc +006e n.sc +006f o.sc +0070 p.sc +0071 q.sc +0072 r.sc +0073 s.sc +0074 t.sc +0075 u.sc +0076 v.sc +0077 w.sc +0078 x.sc +0079 y.sc +007a z.sc +002d hyphen.sc \ No newline at end of file diff --git a/grobid-home/pdfalto/languages/xpdfrc b/grobid-home/pdfalto/languages/xpdfrc index 628a6d9ffd..be909550ca 100644 --- a/grobid-home/pdfalto/languages/xpdfrc +++ b/grobid-home/pdfalto/languages/xpdfrc @@ -64,4 +64,6 @@ nameToUnicode languages/xpdf-others/oldstyle.nameToUnicode nameToUnicode languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode languages/xpdf-others/ligatures.nameToUnicode nameToUnicode languages/xpdf-others/fitted.nameToUnicode +nameToUnicode languages/xpdf-others/others.nameToUnicode +nameToUnicode languages/xpdf-others/sc.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/lin-64/xpdfrc b/grobid-home/pdfalto/lin-64/xpdfrc index 0f59473c4d..6c997423ea 100644 --- a/grobid-home/pdfalto/lin-64/xpdfrc +++ b/grobid-home/pdfalto/lin-64/xpdfrc @@ -65,5 +65,6 @@ nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode nameToUnicode ../languages/xpdf-others/others.nameToUnicode +nameToUnicode ../languages/xpdf-others/sc.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac-64/xpdfrc b/grobid-home/pdfalto/mac-64/xpdfrc index 37bd6ea62e..e0dc52b508 100644 --- a/grobid-home/pdfalto/mac-64/xpdfrc +++ b/grobid-home/pdfalto/mac-64/xpdfrc @@ -65,4 +65,5 @@ nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode nameToUnicode ../languages/xpdf-others/others.nameToUnicode +nameToUnicode ../languages/xpdf-others/sc.nameToUnicode #----- end oldstyle support package diff --git a/grobid-home/pdfalto/mac_arm-64/xpdfrc b/grobid-home/pdfalto/mac_arm-64/xpdfrc index 37bd6ea62e..e0dc52b508 100644 --- a/grobid-home/pdfalto/mac_arm-64/xpdfrc +++ b/grobid-home/pdfalto/mac_arm-64/xpdfrc @@ -65,4 +65,5 @@ nameToUnicode ../languages/xpdf-others/taboldstyle.nameToUnicode nameToUnicode ../languages/xpdf-others/ligatures.nameToUnicode nameToUnicode ../languages/xpdf-others/fitted.nameToUnicode nameToUnicode ../languages/xpdf-others/others.nameToUnicode +nameToUnicode ../languages/xpdf-others/sc.nameToUnicode #----- end oldstyle support package