Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

expose AltoPdf as REST API #552

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,9 @@ public File getXmlFile() {
/**
 * Replaces the XML file reference held by this object.
 *
 * @param xmlFile the file to store; kept as-is, no validation is performed here
 */
public void setXmlFile(final File xmlFile) {
    this.xmlFile = xmlFile;
}

/**
 * Sets the {@code cleanupXml} flag of this object.
 *
 * <p>NOTE(review): presumably this controls whether the XML file is removed
 * when the source is closed - confirm against the close/cleanup code.
 *
 * @param cleanupXml the new flag value
 */
public void setCleanupXml(final boolean cleanupXml) {
    this.cleanupXml = cleanupXml;
}
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.grobid.service;

import com.codahale.metrics.annotation.Timed;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.glassfish.jersey.media.multipart.FormDataParam;

import org.grobid.service.process.AltoRestProcessFiles;

import javax.ws.rs.*;
import javax.ws.rs.core.*;

import java.io.InputStream;


/**
 * REST resource of the GROBID service that accepts an uploaded PDF and
 * returns the result of the ALTO processing as XML.
 *
 * <p>The resource is mounted under {@link GrobidPaths#PATH_PDF_ALTO} and hands
 * the actual work over to {@link AltoRestProcessFiles}.
 *
 * @author FloZi, Damien, Patrice
 */
@Path(GrobidPaths.PATH_PDF_ALTO)
@Singleton
@Timed
public class AltoRestService {

    /** Name of the multipart form field carrying the uploaded PDF. */
    private static final String INPUT = "input";

    /** Worker performing the conversion; populated by the injector. */
    @Inject
    private AltoRestProcessFiles restProcessFiles;

    /** Constructor used by the injector; no state is initialised here. */
    @Inject
    public AltoRestService() {
    }

    /**
     * Handles a multipart POST carrying a PDF and delegates it to the ALTO processor.
     *
     * @param inputStream content of the {@code input} form field
     * @return the response built by {@link AltoRestProcessFiles#processPDFReferenceAlto(InputStream)}
     * @throws Exception whatever the underlying processor lets escape
     */
    @POST
    @Consumes(MediaType.MULTIPART_FORM_DATA)
    @Produces(MediaType.APPLICATION_XML)
    public Response processPDFReferenceAlto(@FormDataParam(INPUT) InputStream inputStream) throws Exception {
        final Response altoResponse = restProcessFiles.processPDFReferenceAlto(inputStream);
        return altoResponse;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ public interface GrobidPaths {
*/
String PATH_CITATIONS_PATENT_PDF_ANNOTATION = "citationPatentAnnotations";

/**
* path extension for converting an uploaded PDF to ALTO XML.
*/
String PATH_PDF_ALTO = "alto";
/**
* path extension for processing sha1.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import com.google.inject.Binder;
import com.google.inject.Provides;
import com.hubspot.dropwizard.guicier.DropwizardAwareModule;

import org.grobid.service.AltoRestService;
import org.grobid.service.GrobidRestService;
import org.grobid.service.GrobidServiceConfiguration;
import org.grobid.service.exceptions.mapper.GrobidExceptionMapper;
import org.grobid.service.exceptions.mapper.GrobidExceptionsTranslationUtility;
import org.grobid.service.exceptions.mapper.GrobidServiceExceptionMapper;
import org.grobid.service.exceptions.mapper.WebApplicationExceptionMapper;
import org.grobid.service.process.AltoRestProcessFiles;
import org.grobid.service.process.GrobidRestProcessFiles;
import org.grobid.service.process.GrobidRestProcessGeneric;
import org.grobid.service.process.GrobidRestProcessString;
Expand Down Expand Up @@ -38,6 +41,9 @@ public void configure(Binder binder) {
binder.bind(GrobidExceptionsTranslationUtility.class);
binder.bind(GrobidExceptionMapper.class);
binder.bind(WebApplicationExceptionMapper.class);
binder.bind(AltoRestService.class);
binder.bind(AltoRestProcessFiles.class);

}

@Provides
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package org.grobid.service.process;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.engines.Engine;
import org.grobid.core.factory.GrobidPoolingFactory;
import org.grobid.core.layout.GraphicObject;
import org.grobid.core.utilities.IOUtilities;
import org.grobid.service.exceptions.GrobidServiceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.ws.rs.core.HttpHeaders;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
import java.io.*;
import java.util.NoSuchElementException;

import org.grobid.core.engines.EngineParsers;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.apache.xerces.impl.dv.util.Base64;
import org.grobid.core.document.Document;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Base64.*;

import static java.nio.file.Files.readAllBytes;
import static java.nio.file.Paths.get;

/**
* Web services consuming a file
*/
@Singleton
public class AltoRestProcessFiles extends GrobidRestProcessFiles {

private static final Logger LOGGER = LoggerFactory.getLogger(AltoRestProcessFiles.class);

@Inject
public AltoRestProcessFiles() {

}

/**
* Uploads the origin PDF, process it and return PDF annotations for references
* in JSON.
*
* @param inputStream the data of origin PDF
* @return a response object containing the JSON annotations
*/
public Response processPDFReferenceAlto(final InputStream inputStream) throws Exception {
LOGGER.debug(methodLogIn());

GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder().withPreprocessImages(true)
.withProcessVectorGraphics(false).build();

Response response = null;
File originFile, xmlFile = null;
Engine engine = null;
final EngineParsers parsers = new EngineParsers();
try {
engine = Engine.getEngine(true);
// conservative check, if no engine is free in the pool a NoSuchElementException
// is normally thrown
if (engine == null) {
parsers.close();
throw new GrobidServiceException("No GROBID engine available", Status.SERVICE_UNAVAILABLE);
}

originFile = IOUtilities.writeInputFile(inputStream);
if (originFile == null) {
LOGGER.error("The input file cannot be written.");
parsers.close();
throw new GrobidServiceException("The input file cannot be written.", Status.INTERNAL_SERVER_ERROR);
}
final DocumentSource docSource = DocumentSource.fromPdf(originFile, -1, -1, true, false, false);
xmlFile = docSource.getXmlFile();
docSource.setCleanupXml( false);
originFile.delete();

// reading text file into stream, try-with-resources
String content = null;
try {
try (InputStream in = new FileInputStream(xmlFile)) {
final byte[] bytes = new byte[(int) xmlFile.length()];

int offset = 0;
while (offset < bytes.length) {
final int result = in.read(bytes, offset, bytes.length - offset);
if (result == -1) {
break;
}
offset += result;
}
content = new String(bytes, StandardCharsets.UTF_8);
}
} catch (final IOException e) {
e.printStackTrace();
}


// real pdf document and concat all images no svg to end file with struct
// <all><content>alto content <content><images><image name="name"
// value="base64"/></images></all>
String base64imagesxml = parsers.getSegmentationParser().processing(docSource, config).getImages().stream()
.filter(image -> !image.getFilePath().toString().endsWith(".svg")).map(image -> {
try
{
LOGGER.debug(" image string"+ image.getFilePath());
LOGGER.debug(" image "+ get(image.getFilePath()));
File tempFile = new File(image.getFilePath());
LOGGER.debug(" image Exists "+ tempFile.exists());

return "<image name=''" + image.getFilePath().toString() + " value=" + java.util.Base64
.getEncoder().encodeToString(readAllBytes(get(image.getFilePath()))) + "'/>";
} catch (IOException e) {
LOGGER.error("An unexpected exception occurs to read image "+ image.getFilePath().toString()+"-"+e);
return " error image "+image.getFilePath().toString(); }
})
.reduce("", String::concat);
docSource.setCleanupXml( true);
DocumentSource.close(docSource, true, true, true);
parsers.close();


//if(base64imagesxml.length()!=0){
content = "<all><content>"+ content +"</content><images>"+ base64imagesxml +"<images><all>";
//}
if (content != null) {
response = Response.status(Status.OK).entity(content)
.header(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_XML + "; charset=UTF-8").build();
}
/*} else {
response = Response.status(Status.NO_CONTENT).build();
}*/

} catch (final NoSuchElementException nseExp) {
LOGGER.error("Could not get an engine from the pool within configured time. Sending service unavailable.");
response = Response.status(Status.SERVICE_UNAVAILABLE).build();
} catch (final Exception exp) {
LOGGER.error("An unexpected exception occurs. ", exp);
response = Response.status(Status.INTERNAL_SERVER_ERROR).entity(exp.getMessage()).build();
} finally {
if (xmlFile != null)
IOUtilities.removeTempFile(xmlFile);

if (engine != null) {
GrobidPoolingFactory.returnEngine(engine);
}
}
LOGGER.debug(methodLogOut());
return response;
}
}