Skip to content

Commit

Permalink
expose a REST API from altoPdf to return xml with text content and al…
Browse files Browse the repository at this point in the history
…l images in base64 format
  • Loading branch information
jorgeveamurguia committed Feb 4, 2020
1 parent 53044fd commit 1460d16
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,9 @@ public File getXmlFile() {
/**
 * Replaces the XML file reference held by this object.
 *
 * @param xmlFile the file to store in the {@code xmlFile} field
 */
public void setXmlFile(final File xmlFile) {
    this.xmlFile = xmlFile;
}

/**
 * Updates the {@code cleanupXml} flag of this object.
 *
 * NOTE(review): presumably the flag controls whether the XML file is
 * deleted when the source is closed - confirm against the close logic.
 *
 * @param cleanupXml the new value of the flag
 */
public void setCleanupXml(final boolean cleanupXml) {
    this.cleanupXml = cleanupXml;
}
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.grobid.service;

import com.codahale.metrics.annotation.Timed;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.glassfish.jersey.media.multipart.FormDataParam;

import org.grobid.service.process.AltoRestProcessFiles;

import javax.ws.rs.*;
import javax.ws.rs.core.*;

import java.io.InputStream;


/**
 * REST resource exposing the PDF to ALTO conversion of the GROBID service.
 * <p>
 * The resource is mounted on {@link GrobidPaths#PATH_PDF_ALTO} and hands every
 * request over to {@link AltoRestProcessFiles}.
 *
 * @author FloZi, Damien, Patrice
 */
@Timed
@Singleton
@Path(GrobidPaths.PATH_PDF_ALTO)
public class AltoRestService {

    /** Name of the multipart form field carrying the uploaded PDF. */
    private static final String INPUT = "input";

    /** Worker performing the actual conversion; injected by the container. */
    @Inject
    private AltoRestProcessFiles altoProcessor;

    @Inject
    public AltoRestService() {
        // nothing to initialise, collaborators are field-injected
    }

    /**
     * Accepts a PDF uploaded as multipart form data and answers with the XML
     * produced by {@link AltoRestProcessFiles#processPDFReferenceAlto(InputStream)}.
     *
     * @param inputStream content of the uploaded PDF (form field {@code input})
     * @return the response built by the processing component
     * @throws Exception whatever the processing component propagates
     */
    @POST
    @Consumes(MediaType.MULTIPART_FORM_DATA)
    @Produces(MediaType.APPLICATION_XML)
    public Response processPDFReferenceAlto(@FormDataParam(INPUT) InputStream inputStream) throws Exception {
        final Response altoResponse = altoProcessor.processPDFReferenceAlto(inputStream);
        return altoResponse;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ public interface GrobidPaths {
*/
String PATH_CITATIONS_PATENT_PDF_ANNOTATION = "citationPatentAnnotations";

/**
* path extension for the service returning the ALTO XML of a PDF together
* with its images.
*/
String PATH_PDF_ALTO = "alto";
/**
* path extension for processing sha1.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import com.google.inject.Binder;
import com.google.inject.Provides;
import com.hubspot.dropwizard.guicier.DropwizardAwareModule;

import org.grobid.service.AltoRestService;
import org.grobid.service.GrobidRestService;
import org.grobid.service.GrobidServiceConfiguration;
import org.grobid.service.exceptions.mapper.GrobidExceptionMapper;
import org.grobid.service.exceptions.mapper.GrobidExceptionsTranslationUtility;
import org.grobid.service.exceptions.mapper.GrobidServiceExceptionMapper;
import org.grobid.service.exceptions.mapper.WebApplicationExceptionMapper;
import org.grobid.service.process.AltoRestProcessFiles;
import org.grobid.service.process.GrobidRestProcessFiles;
import org.grobid.service.process.GrobidRestProcessGeneric;
import org.grobid.service.process.GrobidRestProcessString;
Expand Down Expand Up @@ -38,6 +41,9 @@ public void configure(Binder binder) {
binder.bind(GrobidExceptionsTranslationUtility.class);
binder.bind(GrobidExceptionMapper.class);
binder.bind(WebApplicationExceptionMapper.class);
binder.bind(AltoRestService.class);
binder.bind(AltoRestProcessFiles.class);

}

@Provides
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package org.grobid.service.process;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.engines.Engine;
import org.grobid.core.factory.GrobidPoolingFactory;
import org.grobid.core.layout.GraphicObject;
import org.grobid.core.utilities.IOUtilities;
import org.grobid.service.exceptions.GrobidServiceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
import java.io.*;
import java.util.NoSuchElementException;

import org.grobid.core.engines.EngineParsers;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.apache.xerces.impl.dv.util.Base64;
import org.grobid.core.document.Document;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Base64.*;

import static java.nio.file.Files.readAllBytes;
import static java.nio.file.Paths.get;

/**
 * Web service converting an uploaded PDF into its ALTO XML representation,
 * bundled with the images extracted from the document.
 */
@Singleton
public class AltoRestProcessFiles extends GrobidRestProcessFiles {

    private static final Logger LOGGER = LoggerFactory.getLogger(AltoRestProcessFiles.class);

    @Inject
    public AltoRestProcessFiles() {

    }

    /**
     * Uploads the origin PDF, converts it and returns one XML document of the
     * form
     * {@code <all><content>ALTO</content><images><image name="path" value="base64"/>...</images></all>}.
     * Images whose path ends with {@code .svg} are left out.
     *
     * @param inputStream the data of origin PDF
     * @return a response object carrying the XML document with status 200,
     *         status 503 when no engine could be obtained from the pool within
     *         the configured time, or status 500 with the error message for
     *         any other failure
     * @throws Exception declared for backward compatibility with callers
     */
    public Response processPDFReferenceAlto(final InputStream inputStream) throws Exception {
        LOGGER.debug(methodLogIn());

        final GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder()
                .withPreprocessImages(true)
                .withProcessVectorGraphics(false)
                .build();

        Response response = null;
        File originFile = null;
        File xmlFile = null;
        Engine engine = null;
        DocumentSource docSource = null;
        final EngineParsers parsers = new EngineParsers();
        try {
            engine = Engine.getEngine(true);
            // conservative check, if no engine is free in the pool a NoSuchElementException
            // is normally thrown
            if (engine == null) {
                throw new GrobidServiceException("No GROBID engine available", Status.SERVICE_UNAVAILABLE);
            }

            originFile = IOUtilities.writeInputFile(inputStream);
            if (originFile == null) {
                LOGGER.error("The input file cannot be written.");
                throw new GrobidServiceException("The input file cannot be written.", Status.INTERNAL_SERVER_ERROR);
            }

            docSource = DocumentSource.fromPdf(originFile, -1, -1, true, false, false);
            xmlFile = docSource.getXmlFile();
            // the flag is switched back on right before the source is closed
            docSource.setCleanupXml(false);
            if (xmlFile == null) {
                throw new GrobidServiceException("The ALTO file is not available.", Status.INTERNAL_SERVER_ERROR);
            }

            // A read failure is propagated to the generic handler below instead of
            // being swallowed, which used to put the literal text "null" in the answer.
            final String alto = new String(readAllBytes(xmlFile.toPath()), StandardCharsets.UTF_8);

            // Every non-SVG image of the document becomes one <image/> element.
            final String imagesXml = parsers.getSegmentationParser().processing(docSource, config)
                    .getImages().stream()
                    .filter(image -> !image.getFilePath().toString().endsWith(".svg"))
                    .map(image -> toImageElement(image.getFilePath().toString(), get(image.getFilePath())))
                    .collect(java.util.stream.Collectors.joining());

            final String body = "<all><content>" + stripXmlDeclaration(alto) + "</content><images>"
                    + imagesXml + "</images></all>";
            response = Response.status(Status.OK).entity(body)
                    .header(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_XML + "; charset=UTF-8").build();

        } catch (final NoSuchElementException nseExp) {
            LOGGER.error("Could not get an engine from the pool within configured time. Sending service unavailable.");
            response = Response.status(Status.SERVICE_UNAVAILABLE).build();
        } catch (final Exception exp) {
            LOGGER.error("An unexpected exception occurs. ", exp);
            response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
        } finally {
            // Resources are released here so that failures do not leak them.
            if (docSource != null) {
                try {
                    docSource.setCleanupXml(true);
                    DocumentSource.close(docSource, true, true, true);
                } catch (final Exception closeExp) {
                    LOGGER.warn("Could not close the document source.", closeExp);
                }
            }
            try {
                parsers.close();
            } catch (final Exception closeExp) {
                LOGGER.warn("Could not close the parsers.", closeExp);
            }
            if (originFile != null) {
                IOUtilities.removeTempFile(originFile);
            }
            if (xmlFile != null) {
                IOUtilities.removeTempFile(xmlFile);
            }
            if (engine != null) {
                GrobidPoolingFactory.returnEngine(engine);
            }
        }
        LOGGER.debug(methodLogOut());
        return response;
    }

    /**
     * Builds the {@code <image name="..." value="..."/>} element of one image.
     *
     * @param name value of the {@code name} attribute (the image path)
     * @param file location the image bytes are read from
     * @return the element with the base64 encoded bytes, or an error text
     *         naming the image when it cannot be read
     */
    private static String toImageElement(final String name, final java.nio.file.Path file) {
        try {
            LOGGER.debug("Encoding image {}", name);
            // java.util.Base64 stays fully qualified: the simple name Base64 is
            // imported from xerces in this file.
            final String encoded = java.util.Base64.getEncoder().encodeToString(readAllBytes(file));
            return "<image name=\"" + escapeXml(name) + "\" value=\"" + encoded + "\"/>";
        } catch (final IOException e) {
            LOGGER.error("An unexpected exception occurs to read image " + name, e);
            return " error image " + escapeXml(name);
        }
    }

    /** Removes a leading byte order mark and XML declaration so the text can be nested in an element. */
    private static String stripXmlDeclaration(final String xml) {
        final String text = xml.startsWith("\uFEFF") ? xml.substring(1) : xml;
        if (text.startsWith("<?xml")) {
            final int end = text.indexOf("?>");
            if (end >= 0) {
                return text.substring(end + 2);
            }
        }
        return text;
    }

    /** Escapes the characters that are not allowed verbatim in XML text or attribute values. */
    private static String escapeXml(final String value) {
        return value.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
    }
}

0 comments on commit 1460d16

Please sign in to comment.