diff --git a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/HalTEIConverter.java b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/HalTEIConverter.java index 9805b8e..8b92918 100644 --- a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/HalTEIConverter.java +++ b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/HalTEIConverter.java @@ -204,6 +204,8 @@ private Element createMetadataTEIHeader(NodeList stuffToTake, Document doc) { private void parseOrgsAddress(Document doc, NodeList orgs) { Node org = null; GrobidService gs = new GrobidService(); + DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory + .newInstance(); for (int i = orgs.getLength() - 1; i >= 0; i--) { org = orgs.item(i); if (org.getNodeType() == Node.ELEMENT_NODE) { @@ -211,12 +213,12 @@ private void parseOrgsAddress(Document doc, NodeList orgs) { NodeList addressNodes = orgElt.getElementsByTagName("addrLine"); NodeList orgNameNodes = orgElt.getElementsByTagName("orgName"); - String orgNameStr = ""; + StringBuilder orgNameStr = new StringBuilder(); Node orgNameNode = null; for (int y = orgNameNodes.getLength() - 1; y >= 0; y--) { orgNameNode = orgNameNodes.item(y); if (orgNameNode.getNodeType() == Node.ELEMENT_NODE) { - orgNameStr += !orgNameStr.isEmpty() ? " "+orgNameNode.getTextContent():orgNameNode.getTextContent(); + orgNameStr.append((orgNameStr.length() > 0) ? " " + orgNameNode.getTextContent() : orgNameNode.getTextContent()); } } @@ -231,8 +233,7 @@ private void parseOrgsAddress(Document doc, NodeList orgs) { if (addrLine != null && isNotBlank(addrLine.getTextContent())) { grobidResponse = gs.processAffiliation(orgNameStr + " " + addrLine.getTextContent() + " " + countryCode); try { - Element node = DocumentBuilderFactory - .newInstance() + Element node = documentBuilderFactory .newDocumentBuilder() .parse(new ByteArrayInputStream(grobidResponse.getBytes())) .getDocumentElement(); diff --git a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/IstexTEIConverter.java b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/IstexTEIConverter.java index 2f1fcbe..696973c 100644 --- a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/IstexTEIConverter.java +++ b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/converters/IstexTEIConverter.java @@ -112,6 +112,8 @@ private void updateKeywords(Document metadata) { private void parseAffiliationString(Document doc, NodeList affs) { Node aff = null; GrobidService gs = new GrobidService(); + DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory + .newInstance(); for (int i = affs.getLength() - 1; i >= 0; i--) { aff = affs.item(i); if (aff.getNodeType() == Node.ELEMENT_NODE) { @@ -125,8 +127,7 @@ private void parseAffiliationString(Document doc, NodeList affs) { try { // (HACK)Grobid may split affiliation string into two affiliation elements, which is considered not well-formed. grobidResponse = "" + grobidResponse + ""; - Element node = DocumentBuilderFactory - .newInstance() + Element node = documentBuilderFactory .newDocumentBuilder() .parse(new ByteArrayInputStream(grobidResponse.getBytes())) .getDocumentElement(); diff --git a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/crossref/CrossRef.java b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/crossref/CrossRef.java index fc900c9..40e5998 100644 --- a/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/crossref/CrossRef.java +++ b/anhalytics-harvest/src/main/java/fr/inria/anhalytics/harvest/crossref/CrossRef.java @@ -1,25 +1,26 @@ package fr.inria.anhalytics.harvest.crossref; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import fr.inria.anhalytics.commons.exceptions.ServiceException; import fr.inria.anhalytics.commons.exceptions.SystemException; import fr.inria.anhalytics.commons.managers.MongoFileManager; import fr.inria.anhalytics.commons.properties.HarvestProperties; -import com.fasterxml.jackson.databind.*; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; + import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathFactory; - -import java.io.*; - +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; @@ -65,19 +66,8 @@ public class CrossRef { private MongoFileManager mm; - private DocumentBuilder docBuilder; - public CrossRef() { this.mm = MongoFileManager.getInstance(false); - - DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); - docFactory.setValidating(false); - //docFactory.setNamespaceAware(true); - try { - docBuilder = docFactory.newDocumentBuilder(); - } catch (ParserConfigurationException e) { - throw new SystemException("Cannot instantiate CrossRef parser", e); - } } /** @@ -85,6 +75,7 @@ public CrossRef() { * service based on core metadata */ public void findDois() { +// XPath xPath = XPathFactory.newInstance().newXPath() // String doi = ""; // String aut = ""; // String title = ""; @@ -277,7 +268,7 @@ private String getMetadataByDoi(String doi) throws Exception { } private HttpURLConnection openConnection(URL url) { - HttpURLConnection urlConn; + HttpURLConnection urlConn; try { urlConn = (HttpURLConnection) url.openConnection(); } catch (IOException e) { @@ -293,10 +284,20 @@ private HttpURLConnection openConnection(URL url) { /** * Try to consolidate some uncertain bibliographical data with crossref web * service based on title and first author. - * */ private String queryCrossref(String query) throws Exception { + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + docFactory.setValidating(false); + //docFactory.setNamespaceAware(true); + DocumentBuilder docBuilder = null; + + try { + docBuilder = docFactory.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + throw new SystemException("Cannot instantiate CrossRef parser", e); + } + String doi = ""; // we check if the entry is not already in the DB diff --git a/anhalytics-kb/src/main/java/fr/inria/anhalytics/kb/datamine/KnowledgeBaseFeeder.java b/anhalytics-kb/src/main/java/fr/inria/anhalytics/kb/datamine/KnowledgeBaseFeeder.java index 448264f..9638f4e 100644 --- a/anhalytics-kb/src/main/java/fr/inria/anhalytics/kb/datamine/KnowledgeBaseFeeder.java +++ b/anhalytics-kb/src/main/java/fr/inria/anhalytics/kb/datamine/KnowledgeBaseFeeder.java @@ -71,6 +71,17 @@ public void initKnowledgeBase() { initResult = mm.initObjects(null, MongoFileManager.ONLY_NOT_MINED_INIT_KB_PROCESS); } + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + docFactory.setValidating(false); + //docFactory.setNamespaceAware(true); + + DocumentBuilder docBuilder = null; + try { + docBuilder = docFactory.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + e.printStackTrace(); + } + if (initResult) { while (mm.hasMore()) { BiblioObject biblioObject = mm.nextBiblioObject(); @@ -81,18 +92,18 @@ public void initKnowledgeBase() { adf.openTransaction(); Document teiDoc = null; try { - InputStream teiStream = new ByteArrayInputStream(mm.getTEICorpus(biblioObject).getBytes()); - DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); - docFactory.setValidating(false); - //docFactory.setNamespaceAware(true); - DocumentBuilder docBuilder = null; + InputStream teiStream = null; try { - docBuilder = docFactory.newDocumentBuilder(); + teiStream = new ByteArrayInputStream(mm.getTEICorpus(biblioObject).getBytes()); teiDoc = docBuilder.parse(teiStream); } catch (Exception e) { logger.error("Error when parsing TEI stream. ", e); + } finally { + if (teiStream != null) { + teiStream.close(); + } + } - teiStream.close(); Publication pub = new Publication(); @@ -138,7 +149,7 @@ public void initKnowledgeBase() { processPersons(editors, "editor", pub, teiDoc, authorsFromfulltextTeiHeader); logger.info("#################################################################"); - } catch(NumberOfCoAuthorsExceededException e) { + } catch (NumberOfCoAuthorsExceededException e) { logger.warn("Skipping publication, number of coauthors are exceeding 30", e); adf.rollback(); teiDoc = null;