From 1460d162bd68eca940d587dd4e3de53c3818e026 Mon Sep 17 00:00:00 2001 From: jvea Date: Tue, 4 Feb 2020 13:16:42 +0100 Subject: [PATCH] expose a REST API from altoPdf to return xml with text content and all images in base64 format --- .../grobid/core/document/DocumentSource.java | 4 +- .../org/grobid/service/AltoRestService.java | 45 +++++ .../java/org/grobid/service/GrobidPaths.java | 4 + .../service/modules/GrobidServiceModule.java | 6 + .../service/process/AltoRestProcessFiles.java | 155 ++++++++++++++++++ 5 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 grobid-service/src/main/java/org/grobid/service/AltoRestService.java create mode 100644 grobid-service/src/main/java/org/grobid/service/process/AltoRestProcessFiles.java diff --git a/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java b/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java index eab6dd4fe0..66ef76f77e 100644 --- a/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java +++ b/grobid-core/src/main/java/org/grobid/core/document/DocumentSource.java @@ -383,7 +383,9 @@ public File getXmlFile() { public void setXmlFile(File xmlFile) { this.xmlFile = xmlFile; } - + public void setCleanupXml( boolean cleanupXml ){ + this.cleanupXml=cleanupXml; + } } diff --git a/grobid-service/src/main/java/org/grobid/service/AltoRestService.java b/grobid-service/src/main/java/org/grobid/service/AltoRestService.java new file mode 100644 index 0000000000..83335d2dae --- /dev/null +++ b/grobid-service/src/main/java/org/grobid/service/AltoRestService.java @@ -0,0 +1,45 @@ +package org.grobid.service; + +import com.codahale.metrics.annotation.Timed; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import org.glassfish.jersey.media.multipart.FormDataParam; + +import org.grobid.service.process.AltoRestProcessFiles; + +import javax.ws.rs.*; +import javax.ws.rs.core.*; + +import java.io.InputStream; + + +/** + * RESTful service for the 
GROBID system, exposing the PDF-to-ALTO XML endpoint. + * + * @author FloZi, Damien, Patrice + */ + +@Timed +@Singleton +@Path(GrobidPaths.PATH_PDF_ALTO) +public class AltoRestService { + + private static final String INPUT = "input"; + + @Inject + public AltoRestService(){ + + } + + @Inject + private AltoRestProcessFiles restProcessFiles; + + @Consumes(MediaType.MULTIPART_FORM_DATA) + @Produces(MediaType.APPLICATION_XML) + @POST + public Response processPDFReferenceAlto(@FormDataParam(INPUT) InputStream inputStream) throws Exception { + + + return restProcessFiles.processPDFReferenceAlto(inputStream); + } +} \ No newline at end of file diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidPaths.java b/grobid-service/src/main/java/org/grobid/service/GrobidPaths.java index 91187f4035..be04cffd49 100755 --- a/grobid-service/src/main/java/org/grobid/service/GrobidPaths.java +++ b/grobid-service/src/main/java/org/grobid/service/GrobidPaths.java @@ -120,6 +120,10 @@ public interface GrobidPaths { */ String PATH_CITATIONS_PATENT_PDF_ANNOTATION = "citationPatentAnnotations"; + /** + * path extension for converting a PDF to ALTO XML + */ + String PATH_PDF_ALTO = "alto"; /** * path extension for processing sha1. 
*/ diff --git a/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java b/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java index 5722157432..d5f0722b04 100644 --- a/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java +++ b/grobid-service/src/main/java/org/grobid/service/modules/GrobidServiceModule.java @@ -5,12 +5,15 @@ import com.google.inject.Binder; import com.google.inject.Provides; import com.hubspot.dropwizard.guicier.DropwizardAwareModule; + +import org.grobid.service.AltoRestService; import org.grobid.service.GrobidRestService; import org.grobid.service.GrobidServiceConfiguration; import org.grobid.service.exceptions.mapper.GrobidExceptionMapper; import org.grobid.service.exceptions.mapper.GrobidExceptionsTranslationUtility; import org.grobid.service.exceptions.mapper.GrobidServiceExceptionMapper; import org.grobid.service.exceptions.mapper.WebApplicationExceptionMapper; +import org.grobid.service.process.AltoRestProcessFiles; import org.grobid.service.process.GrobidRestProcessFiles; import org.grobid.service.process.GrobidRestProcessGeneric; import org.grobid.service.process.GrobidRestProcessString; @@ -38,6 +41,9 @@ public void configure(Binder binder) { binder.bind(GrobidExceptionsTranslationUtility.class); binder.bind(GrobidExceptionMapper.class); binder.bind(WebApplicationExceptionMapper.class); + binder.bind(AltoRestService.class); + binder.bind(AltoRestProcessFiles.class); + } @Provides diff --git a/grobid-service/src/main/java/org/grobid/service/process/AltoRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/AltoRestProcessFiles.java new file mode 100644 index 0000000000..2fe14d0ccf --- /dev/null +++ b/grobid-service/src/main/java/org/grobid/service/process/AltoRestProcessFiles.java @@ -0,0 +1,155 @@ +package org.grobid.service.process; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import 
org.grobid.core.document.DocumentSource; +import org.grobid.core.engines.Engine; +import org.grobid.core.factory.GrobidPoolingFactory; +import org.grobid.core.layout.GraphicObject; +import org.grobid.core.utilities.IOUtilities; +import org.grobid.service.exceptions.GrobidServiceException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.core.HttpHeaders; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; +import javax.ws.rs.core.Response.Status; +import java.io.*; +import java.util.NoSuchElementException; + +import org.grobid.core.engines.EngineParsers; +import org.grobid.core.engines.config.GrobidAnalysisConfig; +import org.apache.xerces.impl.dv.util.Base64; +import org.grobid.core.document.Document; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Base64.*; + +import static java.nio.file.Files.readAllBytes; +import static java.nio.file.Paths.get; + +/** + * Web services consuming a file + */ +@Singleton +public class AltoRestProcessFiles extends GrobidRestProcessFiles { + + private static final Logger LOGGER = LoggerFactory.getLogger(AltoRestProcessFiles.class); + + @Inject + public AltoRestProcessFiles() { + + } + + /** + * Uploads the origin PDF, processes it and returns the resulting ALTO content + * as XML. 
+ * + * @param inputStream the data of origin PDF + * @return a response object containing the XML content + */ + public Response processPDFReferenceAlto(final InputStream inputStream) throws Exception { + LOGGER.debug(methodLogIn()); + + GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().withPreprocessImages(true) + .withProcessVectorGraphics(false).build(); + + Response response = null; + File originFile, xmlFile = null; + Engine engine = null; + final EngineParsers parsers = new EngineParsers(); + try { + engine = Engine.getEngine(true); + // conservative check, if no engine is free in the pool a NoSuchElementException + // is normally thrown + if (engine == null) { + parsers.close(); + throw new GrobidServiceException("No GROBID engine available", Status.SERVICE_UNAVAILABLE); + } + + originFile = IOUtilities.writeInputFile(inputStream); + if (originFile == null) { + LOGGER.error("The input file cannot be written."); + parsers.close(); + throw new GrobidServiceException("The input file cannot be written.", Status.INTERNAL_SERVER_ERROR); + } + final DocumentSource docSource = DocumentSource.fromPdf(originFile, -1, -1, true, false, false); + xmlFile = docSource.getXmlFile(); + docSource.setCleanupXml( false); + originFile.delete(); + + // reading text file into stream, try-with-resources + String content = null; + try { + try (InputStream in = new FileInputStream(xmlFile)) { + final byte[] bytes = new byte[(int) xmlFile.length()]; + + int offset = 0; + while (offset < bytes.length) { + final int result = in.read(bytes, offset, bytes.length - offset); + if (result == -1) { + break; + } + offset += result; + } + content = new String(bytes, StandardCharsets.UTF_8); + } + } catch (final IOException e) { + e.printStackTrace(); + } + + + // process the PDF document and append all its non-SVG images after the + // ALTO content + String base64imagesxml = parsers.getSegmentationParser().processing(docSource, 
config).getImages().stream() + .filter(image -> !image.getFilePath().toString().endsWith(".svg")).map(image -> { + try + { + LOGGER.debug(" image string"+ image.getFilePath()); + LOGGER.debug(" image "+ get(image.getFilePath())); + File tempFile = new File(image.getFilePath()); + LOGGER.debug(" image Exists "+ tempFile.exists()); + + return ""; + } catch (IOException e) { + LOGGER.error("An unexpected exception occurs to read image "+ image.getFilePath().toString()+"-"+e); + return " error image "+image.getFilePath().toString(); } + }) + .reduce("", String::concat); + docSource.setCleanupXml( true); + DocumentSource.close(docSource, true, true, true); + parsers.close(); + + + //if(base64imagesxml.length()!=0){ + content = ""+ content +""+ base64imagesxml +""; + //} + if (content != null) { + response = Response.status(Status.OK).entity(content) + .header(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_XML + "; charset=UTF-8").build(); + } + /*} else { + response = Response.status(Status.NO_CONTENT).build(); + }*/ + + } catch (final NoSuchElementException nseExp) { + LOGGER.error("Could not get an engine from the pool within configured time. Sending service unavailable."); + response = Response.status(Status.SERVICE_UNAVAILABLE).build(); + } catch (final Exception exp) { + LOGGER.error("An unexpected exception occurs. ", exp); + response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build(); + } finally { + if (xmlFile != null) + IOUtilities.removeTempFile(xmlFile); + + if (engine != null) { + GrobidPoolingFactory.returnEngine(engine); + } + } + LOGGER.debug(methodLogOut()); + return response; + } +} \ No newline at end of file