Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update chemical formula parser for processing in parallel #61

Merged
merged 5 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion resources/web
Submodule web updated from 9ad40e to 035761
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ public class ChemicalComposition {
private String formula;

private String name;

private Integer code;

private String message;

public String getFormula() {
return formula;
Expand All @@ -38,7 +42,7 @@ public void setComposition(Map<String, String> composition) {
}

/**
 * Tells whether this composition carries no information at all.
 *
 * @return {@code true} when both {@code formula} and {@code name} are blank and the
 *         {@code composition} map is missing or has no entries
 */
public boolean isEmpty() {
    // A single return: the previous text held two consecutive returns, the second unreachable.
    return StringUtils.isBlank(formula)
        && StringUtils.isBlank(name)
        && (composition == null || composition.keySet().isEmpty());
}

@Override
Expand All @@ -62,4 +66,20 @@ public boolean equals(Object o) {
public int hashCode() {
return Objects.hash(composition, formula, name);
}

/**
 * Returns the value of the {@code code} field.
 *
 * @return the stored code; it is a boxed {@code Integer}, so it can be {@code null}
 */
public Integer getCode() {
return code;
}

/**
 * Stores the given value in the {@code code} field.
 *
 * @param code the code to store; no validation is applied here
 */
public void setCode(Integer code) {
this.code = code;
}

/**
 * Returns the value of the {@code message} field.
 *
 * @return the stored message, possibly {@code null}
 */
public String getMessage() {
return message;
}

/**
 * Stores the given value in the {@code message} field.
 *
 * @param message the message to store; no validation is applied here
 */
public void setMessage(String message) {
this.message = message;
}
}
86 changes: 83 additions & 3 deletions src/main/java/org/grobid/core/engines/MaterialParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import org.apache.commons.text.StringEscapeUtils;
import org.grobid.core.GrobidModel;
import org.grobid.core.analyzers.DeepAnalyzer;
import org.grobid.core.data.document.Span;
import org.grobid.core.data.material.ChemicalComposition;
import org.grobid.core.data.material.Formula;
import org.grobid.core.data.material.Material;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.exceptions.GrobidExceptionStatus;
import org.grobid.core.features.FeaturesVectorMaterial;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.LayoutToken;
Expand All @@ -27,11 +29,11 @@
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.apache.commons.collections4.CollectionUtils.isEmpty;
import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;
Expand Down Expand Up @@ -87,6 +89,84 @@ public List<Material> process(String text) {
return process(SuperconductorsParser.textToLayoutTokens(text));
}

/**
 * Processes a batch of texts and returns one list of materials per input text, in input order.
 * <p>
 * Blank texts are not sent to the tagger: they are mapped directly to an empty list, while
 * the remaining texts are tokenised and processed together through
 * {@link #processParallelLT(List)}.
 *
 * @param texts the texts to process; may contain blank entries
 * @return a list with exactly {@code texts.size()} elements, where element {@code i} holds
 *         the materials extracted from {@code texts.get(i)}; empty when {@code texts} is
 *         {@code null} or empty
 */
public List<List<Material>> processParallel(List<String> texts) {
    if (texts == null || texts.isEmpty()) {
        return new ArrayList<>();
    }

    // Only non-blank texts can be tagged; blank ones are re-inserted as empty results below.
    List<List<LayoutToken>> taggableTokens = texts.stream()
        .filter(text -> !StringUtils.isBlank(text))
        .map(SuperconductorsParser::textToLayoutTokens)
        .collect(Collectors.toList());

    // Skip the model entirely when there is nothing to tag.
    List<List<Material>> tagged = taggableTokens.isEmpty()
        ? new ArrayList<>()
        : processParallelLT(taggableTokens);

    // Single pass re-alignment: consume tagged results in order, filling gaps for blank inputs.
    List<List<Material>> output = new ArrayList<>(texts.size());
    int nextTagged = 0;
    for (String text : texts) {
        if (StringUtils.isBlank(text)) {
            output.add(new ArrayList<>());
        } else {
            output.add(tagged.get(nextTagged));
            nextTagged++;
        }
    }

    return output;
}

/**
 * Labels a batch of token sequences in a single call and extracts the materials of each one.
 * <p>
 * Every sequence is normalised and re-tokenised, turned into a feature block, and all blocks
 * are passed together to {@code label}. The labelled output is split on blank lines and
 * handed, together with the normalised tokens, to
 * {@link #extractParallelResults(List, List)}.
 *
 * @param layoutTokensBatch one token sequence per input text
 * @return one list of materials per input sequence, in the same order; empty when the batch
 *         is {@code null} or empty
 * @throws GrobidException if labelling fails, if the result cannot be aligned with the
 *         input, or if any other error occurs during processing
 */
public List<List<Material>> processParallelLT(List<List<LayoutToken>> layoutTokensBatch) {
    List<List<Material>> entities = new ArrayList<>();

    // Nothing to label: avoid calling the model with an empty batch.
    if (layoutTokensBatch == null || layoutTokensBatch.isEmpty()) {
        return entities;
    }

    // Normalisation
    List<List<LayoutToken>> normalisedTokens = layoutTokensBatch.stream()
        .map(SuperconductorsParser::normalizeAndRetokenizeLayoutTokens)
        .toList();

    try {
        // Each feature block ends with an extra newline.
        List<String> tokensWithFeatures = normalisedTokens.stream()
            .map(nt -> addFeatures(nt) + "\n")
            .toList();

        String labellingResult;
        try {
            labellingResult = label(tokensWithFeatures);
        } catch (Exception e) {
            throw new GrobidException("CRF labeling for superconductors parsing failed.", e);
        }

        // The labelled output is split on blank lines, one chunk per input sequence.
        List<String> resultingBlocks = Arrays.asList(labellingResult.split("\n\n"));
        entities.addAll(extractParallelResults(normalisedTokens, resultingBlocks));
    } catch (GrobidException e) {
        // Already specific (e.g. carries BAD_INPUT_DATA): do not hide it behind a generic wrapper.
        throw e;
    } catch (Exception e) {
        throw new GrobidException("An exception occurred while running Grobid.", e);
    }

    return entities;
}

/**
 * Pairs each token sequence with the labelled block at the same position and extracts the
 * materials from every pair via {@code extractResults}.
 *
 * @param tokens  the token sequences that were labelled
 * @param results the labelled blocks, expected to be as many as {@code tokens}
 * @return one list of materials per token sequence, in the same order
 * @throws GrobidException with status {@code BAD_INPUT_DATA} when the two lists differ in size
 */
public List<List<Material>> extractParallelResults(List<List<LayoutToken>> tokens, List<String> results) {
    final int batchSize = tokens.size();

    // Guard clause: a size mismatch means the tokens and blocks cannot be paired by position.
    if (batchSize != results.size()) {
        throw new GrobidException("One of the text provided is invalid or empty and cannot be tagged. Please provide a clean input.", GrobidExceptionStatus.BAD_INPUT_DATA);
    }

    return IntStream.range(0, batchSize)
        .mapToObj(idx -> extractResults(tokens.get(idx), results.get(idx)))
        .collect(Collectors.toCollection(ArrayList::new));
}


public List<Material> process(List<LayoutToken> tokens) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,16 @@ public ChemicalComposition convertNameToFormula(String name) {
try (CloseableHttpResponse response = httpClient.execute(request)) {
int statusCode = response.getStatusLine().getStatusCode();

if (statusCode == HttpURLConnection.HTTP_OK) {
outputFormula = fromJsonToChemicalComposition(response.getEntity().getContent());
if (statusCode != HttpURLConnection.HTTP_OK) {
LOGGER.debug("Not OK answer. Input: " + name + ". Status code: " + response.getStatusLine().getStatusCode());
} else {
LOGGER.debug("Not OK answer. Input: " + name + ", status code: " + statusCode);
outputFormula = fromJsonToChemicalComposition(response.getEntity().getContent());
if (outputFormula != null && outputFormula.getCode() != HttpURLConnection.HTTP_OK) {
LOGGER.debug("Not OK answer. Input: " + name + ". " +
"Status code: " + outputFormula.getCode() +
"Message: " + outputFormula.getMessage());
outputFormula = new ChemicalComposition();
}
}
}

Expand Down Expand Up @@ -101,6 +107,11 @@ public ChemicalComposition convertFormulaToComposition(String formula) {
LOGGER.debug("Not OK answer. Input: " + formula + ". Status code: " + response.getStatusLine().getStatusCode());
} else {
outputComposition = fromJsonToChemicalComposition(response.getEntity().getContent());
if (outputComposition != null && outputComposition.getCode() != HttpURLConnection.HTTP_OK) {
LOGGER.debug("Not OK answer. Input: " + formula + ". Status code: " + outputComposition.getCode() +
"Message: " + outputComposition.getMessage());
outputComposition = new ChemicalComposition();
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.core.MediaType;
import java.util.Arrays;
import java.util.List;

@Singleton
Expand Down Expand Up @@ -42,10 +43,32 @@ public List<Material> processTextSuperconductorsGet(@FormDataParam("text") Strin
return parseMaterial(text);
}


/**
 * POST endpoint mapped to the "multiparse" path, producing JSON.
 * Passes the "texts" form field unchanged to {@code parseMaterials}.
 *
 * @param texts the content of the "texts" form field
 * @return whatever {@code parseMaterials} returns for {@code texts}
 */
@Path("multiparse")
@Produces(MediaType.APPLICATION_JSON)
@POST
public List<List<Material>> processTextSuperconductorsPost2(@FormDataParam("texts") String texts) {
return parseMaterials(texts);
}

/**
 * GET endpoint mapped to the "multiparse" path, producing JSON.
 * Passes the "texts" form field unchanged to {@code parseMaterials}.
 * NOTE(review): the parameter is bound with {@code @FormDataParam} on a GET method; a GET
 * request usually carries no form body, so {@code texts} may arrive as null — confirm.
 *
 * @param texts the content of the "texts" form field
 * @return whatever {@code parseMaterials} returns for {@code texts}
 */
@Path("multiparse")
@Produces(MediaType.APPLICATION_JSON)
@GET
public List<List<Material>> processTextSuperconductorsGet2(@FormDataParam("texts") String texts) {
return parseMaterials(texts);
}

/**
 * Normalises Windows line endings in the given text and runs the material parser on it.
 *
 * @param text the raw text received by the endpoint; may be {@code null} when the form
 *             field is missing
 * @return the materials returned by {@code materialParser.process}, or an empty list when
 *         {@code text} is {@code null}
 */
private List<Material> parseMaterial(@FormDataParam("text") String text) {
    if (text == null) {
        // A missing form field would otherwise fail with a NullPointerException on replace().
        return List.of();
    }

    String textPreprocessed = text.replace("\r\n", "\n");

    return materialParser.process(textPreprocessed);
}

/**
 * Splits the given text into lines and processes all lines as one batch.
 * <p>
 * Windows line endings are normalised to {@code \n} first; each resulting line is treated
 * as a separate text by {@code materialParser.processParallel}.
 *
 * @param text the raw, newline-separated texts received by the endpoint; may be
 *             {@code null} when the form field is missing
 * @return the result of {@code materialParser.processParallel} for the split lines, or an
 *         empty list when {@code text} is {@code null}
 */
private List<List<Material>> parseMaterials(@FormDataParam("text") String text) {
    if (text == null) {
        // A missing form field would otherwise fail with a NullPointerException on replace().
        return List.of();
    }

    String textPreprocessed = text.replace("\r\n", "\n");

    // String.split drops trailing empty strings, so trailing blank lines produce no entry.
    List<String> lines = Arrays.asList(textPreprocessed.split("\n"));
    return materialParser.processParallel(lines);
}

}

This file was deleted.

Loading