diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java
index b96acd8fc7e..ecb099cf26c 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java
@@ -47,6 +47,7 @@ public class ImportDDIServiceBean {
     public static final String NAMING_PROTOCOL_DOI = "doi";
     public static final String AGENCY_HANDLE = "handle";
     public static final String AGENCY_DOI = "DOI";
+    public static final String AGENCY_DARA = "dara"; // da|ra - http://www.da-ra.de/en/home/
     public static final String REPLICATION_FOR_TYPE = "replicationFor";
     public static final String VAR_WEIGHTED = "wgtd";
     public static final String VAR_INTERVAL_CONTIN = "contin";
@@ -91,6 +92,7 @@ public class ImportDDIServiceBean {
     public static final String NOTE_SUBJECT_LOCKSS_PERM = "LOCKSS Permission";
 
     public static final String NOTE_TYPE_REPLICATION_FOR = "DVN:REPLICATION_FOR";
+    private static final String HARVESTED_FILE_STORAGE_PREFIX = "http://";
 
     private XMLInputFactory xmlInputFactory = null;
     @EJB CustomFieldServiceBean customFieldService;
@@ -241,18 +243,28 @@ private void processCodeBook(ImportType importType, XMLStreamReader xmlr, Datase
         if (event == XMLStreamConstants.START_ELEMENT) {
             if (xmlr.getLocalName().equals("docDscr")) {
                 processDocDscr(xmlr, datasetDTO);
-            }
-            else if (xmlr.getLocalName().equals("stdyDscr")) {
+            } else if (xmlr.getLocalName().equals("stdyDscr")) {
                 processStdyDscr(importType, xmlr, datasetDTO);
-            }
-            else if (xmlr.getLocalName().equals("fileDscr") && !isMigrationImport(importType)) {
+            } else if (xmlr.getLocalName().equals("otherMat") && (isNewImport(importType) || isHarvestWithFilesImport(importType)) ) {
+                processOtherMat(xmlr, datasetDTO, filesMap);
+            } else if (xmlr.getLocalName().equals("fileDscr") && isHarvestWithFilesImport(importType)) {
+                // If this is a harvesting import, we'll attempt to extract some minimal
+                // file-level metadata information from the fileDscr sections as well.
+                // TODO: add more info here... -- 4.6
+                processFileDscrMinimal(xmlr, datasetDTO, filesMap);
+            } else if (xmlr.getLocalName().equals("fileDscr") && isNewImport(importType)) {
+                // this is a "full" fileDscr section - Dataverses use it
+                // to encode *tabular* files only. It will contain the information
+                // about variables, observations, etc. It will be complemented
+                // by a number of entries in the dataDscr section.
+                // Dataverses do not use this section for harvesting exports, since
+                // we don't harvest tabular metadata. And all the "regular"
+                // file-level metadata is encoded in otherMat sections.
+                // The goal is to one day be able to import such tabular
+                // metadata using the direct (non-harvesting) import API.
                 // EMK TODO: add this back in for ImportType.NEW
                 //processFileDscr(xmlr, datasetDTO, filesMap);
-
-            }
-            else if (xmlr.getLocalName().equals("otherMat") && (isNewImport(importType) || isHarvestWithFilesImport(importType)) ) {
-                processOtherMat(xmlr, datasetDTO, filesMap);
-            }
+            }
         } else if (event == XMLStreamConstants.END_ELEMENT) {
             if (xmlr.getLocalName().equals("codeBook")) return;
@@ -432,12 +444,23 @@ else if (xmlr.getLocalName().equals("relStdy")) {
     private void processCitation(ImportType importType, XMLStreamReader xmlr, DatasetDTO datasetDTO) throws XMLStreamException, ImportException {
         DatasetVersionDTO dvDTO = datasetDTO.getDatasetVersion();
         MetadataBlockDTO citation=datasetDTO.getDatasetVersion().getMetadataBlocks().get("citation");
+        boolean distStatementProcessed = false;
         for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
             if (event == XMLStreamConstants.START_ELEMENT) {
                 if (xmlr.getLocalName().equals("titlStmt")) processTitlStmt(xmlr, datasetDTO);
                 else if (xmlr.getLocalName().equals("rspStmt")) processRspStmt(xmlr,citation);
                 else if (xmlr.getLocalName().equals("prodStmt")) processProdStmt(xmlr,citation);
-                else if (xmlr.getLocalName().equals("distStmt")) processDistStmt(xmlr,citation);
+                else if (xmlr.getLocalName().equals("distStmt")) {
+                    if (distStatementProcessed) {
+                        // We've already encountered one Distribution Statement in
+                        // this citation, we'll just skip any consecutive ones.
+                        // This is a defensive check against duplicate distStmt
+                        // in some DDIs (notably, from ICPSR)
+                    } else {
+                        processDistStmt(xmlr,citation);
+                        distStatementProcessed = true;
+                    }
+                }
                 else if (xmlr.getLocalName().equals("serStmt")) processSerStmt(xmlr,citation);
                 else if (xmlr.getLocalName().equals("verStmt")) processVerStmt(importType, xmlr,dvDTO);
                 else if (xmlr.getLocalName().equals("notes")) {
@@ -882,11 +905,23 @@ private void processAnlyInfo(XMLStreamReader xmlr, MetadataBlockDTO socialScienc
 
     private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) throws XMLStreamException {
         MetadataBlockDTO socialScience =getSocialScience(dvDTO);
+
+        String collMode = "";
+        String timeMeth = "";
+        String weight = "";
+
         for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
             if (event == XMLStreamConstants.START_ELEMENT) {
                 //timeMethod
                 if (xmlr.getLocalName().equals("timeMeth")) {
-                    socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", parseText( xmlr, "timeMeth" )));
+                    String thisValue = parseText( xmlr, "timeMeth" );
+                    if (!StringUtil.isEmpty(thisValue)) {
+                        if (!"".equals(timeMeth)) {
+                            timeMeth = timeMeth.concat(", ");
+                        }
+                        timeMeth = timeMeth.concat(thisValue);
+                    }
+                    //socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", parseText( xmlr, "timeMeth" )));
                 } else if (xmlr.getLocalName().equals("dataCollector")) {
                     socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("dataCollector", parseText( xmlr, "dataCollector" )));
                 // frequencyOfDataCollection
@@ -903,7 +938,14 @@ private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) thro
                     socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("deviationsFromSampleDesign", parseText( xmlr, "deviat" )));
                 // collectionMode
                 } else if (xmlr.getLocalName().equals("collMode")) {
-                    socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", parseText( xmlr, "collMode" )));
+                    String thisValue = parseText( xmlr, "collMode" );
+                    if (!StringUtil.isEmpty(thisValue)) {
+                        if (!"".equals(collMode)) {
+                            collMode = collMode.concat(", ");
+                        }
+                        collMode = collMode.concat(thisValue);
+                    }
+                    //socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", parseText( xmlr, "collMode" )));
                 //researchInstrument
                 } else if (xmlr.getLocalName().equals("resInstru")) {
                     socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("researchInstrument", parseText( xmlr, "resInstru" )));
@@ -916,12 +958,30 @@ private void processDataColl(XMLStreamReader xmlr, DatasetVersionDTO dvDTO) thro
                 } else if (xmlr.getLocalName().equals("ConOps")) {
                     socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("controlOperations", parseText( xmlr, "ConOps" )));
                 } else if (xmlr.getLocalName().equals("weight")) {
-                    socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", parseText( xmlr, "weight" )));
+                    String thisValue = parseText( xmlr, "weight" );
+                    if (!StringUtil.isEmpty(thisValue)) {
+                        if (!"".equals(weight)) {
+                            weight = weight.concat(", ");
+                        }
+                        weight = weight.concat(thisValue);
+                    }
+                    //socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", parseText( xmlr, "weight" )));
                 } else if (xmlr.getLocalName().equals("cleanOps")) {
                     socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("cleaningOperations", parseText( xmlr, "cleanOps" )));
-                }
+                }
             } else if (event == XMLStreamConstants.END_ELEMENT) {
-                if (xmlr.getLocalName().equals("dataColl")) return;
+                if (xmlr.getLocalName().equals("dataColl")) {
+                    if (!StringUtil.isEmpty(timeMeth)) {
+                        socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("timeMethod", timeMeth));
+                    }
+                    if (!StringUtil.isEmpty(collMode)) {
+                        socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("collectionMode", collMode));
+                    }
+                    if (!StringUtil.isEmpty(weight)) {
+                        socialScience.getFields().add(FieldDTO.createPrimitiveFieldDTO("weighting", weight));
+                    }
+                    return;
+                }
             }
         }
     }
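The three processDataColl hunks above share one pattern worth spelling out: instead of emitting a separate social science field for every timeMeth, collMode, and weight element encountered, the values are now accumulated into a single comma-separated string and only flushed into the metadata block when the closing dataColl tag is reached. A minimal standalone sketch of that accumulate-then-join pass (the sample values are hypothetical, not from the patch):

```java
import java.util.Arrays;
import java.util.List;

public class JoinRepeatedValuesDemo {
    public static void main(String[] args) {
        // e.g. two <collMode> elements parsed from the same <dataColl> section:
        List<String> parsedValues = Arrays.asList("mail questionnaire", "telephone interview");

        String collMode = "";
        for (String thisValue : parsedValues) {
            if (thisValue != null && !thisValue.isEmpty()) {
                if (!"".equals(collMode)) {
                    collMode = collMode.concat(", "); // separator between repeated values
                }
                collMode = collMode.concat(thisValue);
            }
        }
        // The single joined value is what ends up in the "collectionMode" field:
        System.out.println(collMode); // mail questionnaire, telephone interview
    }
}
```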
" + parseText(xmlr, "p") + "

"; - } else if (xmlr.getLocalName().equals("emph")) { - returnString += "" + parseText(xmlr, "emph") + ""; - } else if (xmlr.getLocalName().equals("hi")) { - returnString += "" + parseText(xmlr, "hi") + ""; + if (xmlr.getLocalName().equals("p") || xmlr.getLocalName().equals("br") || xmlr.getLocalName().equals("head")) { + returnString += "

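The widened branches in parseTextNew above accept both the DDI formatting tags (p, emph, hi, ExtLink) and the stray HTML-style tags (br, head, em, i, b, a) that show up in harvested records, normalizing them all to a small set of HTML output tags. A hypothetical helper, not part of the patch, summarizing that mapping:

```java
// Sketch of the tag normalization performed by the parseTextNew branches above.
public class DdiTagMappingDemo {
    static String mapTag(String ddiTag) {
        if (ddiTag.equals("p") || ddiTag.equals("br") || ddiTag.equals("head")) {
            return "p";      // block-level elements are emitted as <p>...</p>
        } else if (ddiTag.equals("emph") || ddiTag.equals("em") || ddiTag.equals("i")) {
            return "em";     // emphasis variants are emitted as <em>...</em>
        } else if (ddiTag.equals("hi") || ddiTag.equals("b")) {
            return "strong"; // highlight/bold variants are emitted as <strong>...</strong>
        }
        return null;
    }

    public static void main(String[] args) {
        // e.g. <emph>wave 2</emph> in the harvested record becomes <em>wave 2</em>
        System.out.println("<" + mapTag("emph") + ">wave 2</" + mapTag("emph") + ">");
    }
}
```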
" + parseText(xmlr, xmlr.getLocalName()) + "

"; + } else if (xmlr.getLocalName().equals("emph") || xmlr.getLocalName().equals("em") || xmlr.getLocalName().equals("i")) { + returnString += "" + parseText(xmlr, xmlr.getLocalName()) + ""; + } else if (xmlr.getLocalName().equals("hi") || xmlr.getLocalName().equals("b")) { + returnString += "" + parseText(xmlr, xmlr.getLocalName()) + ""; } else if (xmlr.getLocalName().equals("ExtLink")) { String uri = xmlr.getAttributeValue(null, "URI"); String text = parseText(xmlr, "ExtLink").trim(); returnString += "" + ( StringUtil.isEmpty(text) ? uri : text) + ""; + } else if (xmlr.getLocalName().equals("a") || xmlr.getLocalName().equals("A")) { + String uri = xmlr.getAttributeValue(null, "URI"); + if (StringUtil.isEmpty(uri)) { + uri = xmlr.getAttributeValue(null, "HREF"); + } + String text = parseText(xmlr, xmlr.getLocalName()).trim(); + returnString += "" + ( StringUtil.isEmpty(text) ? uri : text) + ""; } else if (xmlr.getLocalName().equals("list")) { returnString += parseText_list(xmlr); } else if (xmlr.getLocalName().equals("citation")) { @@ -1343,6 +1420,8 @@ private Object parseTextNew(XMLStreamReader xmlr, String endTag) throws XMLStrea } else { returnString += parseText_citation(xmlr); } + } else if (xmlr.getLocalName().equals("txt")) { + returnString += parseText(xmlr); } else { throw new EJBException("ERROR occurred in mapDDI (parseText): tag not yet supported: <" + xmlr.getLocalName() + ">" ); } @@ -1373,7 +1452,7 @@ private String parseText_list (XMLStreamReader xmlr) throws XMLStreamException { // check type String listType = xmlr.getAttributeValue(null, "type"); - if ("bulleted".equals(listType) ){ + if ("bulleted".equals(listType) || listType == null){ listString = ""; } else if ("ordered".equals(listType) ) { @@ -1524,6 +1603,31 @@ private void parseStudyIdDOI(String _id, DatasetDTO datasetDTO) throws ImportExc datasetDTO.setIdentifier(_id.substring(index2+1)); } + + private void parseStudyIdDoiICPSRdara(String _id, DatasetDTO datasetDTO) throws ImportException{ + /* + dara/ICPSR DOIs are formatted without the hdl: prefix; for example - + 10.3886/ICPSR06635.v1 + so we assume that everything before the last "/" is the authority, + and everything past it - the identifier: + */ + + int index = _id.lastIndexOf('/'); + + if (index == -1) { + throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+". '/' not found in string"); + } + + if (index == _id.length() - 1) { + throw new ImportException("Error parsing ICPSR/dara DOI IdNo: "+_id+" ends with '/'"); + } + + datasetDTO.setAuthority(_id.substring(0, index)); + datasetDTO.setProtocol("doi"); + datasetDTO.setDoiSeparator("/"); + + datasetDTO.setIdentifier(_id.substring(index+1)); + } // Helper methods private MetadataBlockDTO getCitation(DatasetVersionDTO dvDTO) { return dvDTO.getMetadataBlocks().get("citation"); @@ -1609,6 +1713,58 @@ private void processOtherMat(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map fi } } + // this method is for attempting to extract the minimal amount of file-level + // metadata from an ICPSR-supplied DDI. (they use the "fileDscr" instead of + // "otherMat" for general file metadata; the only field they populate is + // "fileName". 
@@ -1609,6 +1713,58 @@ private void processOtherMat(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map fi
         }
     }
 
+    // this method is for attempting to extract the minimal amount of file-level
+    // metadata from an ICPSR-supplied DDI. (they use the "fileDscr" instead of
+    // "otherMat" for general file metadata; the only field they populate is
+    // "fileName"). -- 4.6
+
+    private void processFileDscrMinimal(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
+        FileMetadataDTO fmdDTO = new FileMetadataDTO();
+
+        if (datasetDTO.getDatasetVersion().getFileMetadatas() == null) {
+            datasetDTO.getDatasetVersion().setFileMetadatas(new ArrayList<>());
+        }
+        datasetDTO.getDatasetVersion().getFileMetadatas().add(fmdDTO);
+
+        DataFileDTO dfDTO = new DataFileDTO();
+        dfDTO.setContentType("data/various-formats"); // reserved ICPSR content type identifier
+        fmdDTO.setDataFile(dfDTO);
+
+        for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) {
+            if (event == XMLStreamConstants.START_ELEMENT) {
+                if (xmlr.getLocalName().equals("fileName")) {
+                    // this is the file name:
+                    String label = parseText(xmlr);
+                    // do some cleanup:
+                    int col = label.lastIndexOf(':');
+                    if ( col > -1) {
+                        if (col < label.length() - 1) {
+                            label = label.substring(col+1);
+                        } else {
+                            label = label.replaceAll(":", "");
+                        }
+                    }
+                    label = label.replaceAll("[#;<>\\?\\|\\*\"]", "");
+                    label = label.replaceAll("/", "-");
+                    // strip leading blanks:
+                    label = label.replaceFirst("^[ \t]*", "");
+                    fmdDTO.setLabel(label);
+                }
+            } else if (event == XMLStreamConstants.END_ELEMENT) {
+                if (xmlr.getLocalName().equals("fileDscr")) {
+                    if (fmdDTO.getLabel() == null || fmdDTO.getLabel().trim().equals("") ) {
+                        fmdDTO.setLabel("harvested file");
+                    }
+                    if (StringUtil.isEmpty(fmdDTO.getDataFile().getStorageIdentifier())) {
+                        fmdDTO.getDataFile().setStorageIdentifier(HARVESTED_FILE_STORAGE_PREFIX);
+                    }
+
+                    return;
+                }
+            }
+        }
+    }
+
     private void processFileDscr(XMLStreamReader xmlr, DatasetDTO datasetDTO, Map filesMap) throws XMLStreamException {
         FileMetadataDTO fmdDTO = new FileMetadataDTO();
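The fileName cleanup in processFileDscrMinimal reduces to a handful of string operations. A standalone sketch with a hypothetical ICPSR-style label:

```java
public class FileNameCleanupDemo {
    public static void main(String[] args) {
        String label = "File: DS0001/06635-0001-Data.dta"; // made-up <fileName> content

        int col = label.lastIndexOf(':');
        if (col > -1 && col < label.length() - 1) {
            label = label.substring(col + 1);              // drop the "File:" prefix
        }
        label = label.replaceAll("[#;<>\\?\\|\\*\"]", ""); // strip unsafe characters
        label = label.replaceAll("/", "-");                // no path separators in labels
        label = label.replaceFirst("^[ \t]*", "");         // strip leading blanks

        System.out.println(label); // DS0001-06635-0001-Data.dta
    }
}
```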
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
index 4c5865d560e..2b4e30e25c1 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
@@ -38,6 +38,7 @@
 import edu.harvard.iq.dataverse.util.json.JsonParseException;
 import edu.harvard.iq.dataverse.util.json.JsonParser;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.StringReader;
@@ -221,7 +222,8 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
         // Kraffmiller's export modules; replace the logic below with clean
         // programmatic lookup of the import plugin needed.
 
-        if ("ddi".equalsIgnoreCase(metadataFormat) || "oai_ddi".equals(metadataFormat)) {
+        if ("ddi".equalsIgnoreCase(metadataFormat) || "oai_ddi".equals(metadataFormat)
+                || metadataFormat.toLowerCase().matches("^oai_ddi.*")) {
             try {
                 String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
                 // TODO:
@@ -230,16 +232,16 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
                 // ImportType.HARVEST vs. ImportType.HARVEST_WITH_FILES
                 logger.fine("importing DDI "+metadataFile.getAbsolutePath());
                 dsDTO = importDDIService.doImport(ImportType.HARVEST_WITH_FILES, xmlToParse);
-            } catch (XMLStreamException e) {
-                throw new ImportException("XMLStreamException" + e);
+            } catch (Exception e) {
+                throw new ImportException("Failed to process DDI XML record: "+ e.getClass() + " (" + e.getMessage() + ")");
             }
         } else if ("dc".equalsIgnoreCase(metadataFormat) || "oai_dc".equals(metadataFormat)) {
             logger.fine("importing DC "+metadataFile.getAbsolutePath());
             try {
                 String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
                 dsDTO = importGenericService.processOAIDCxml(xmlToParse);
-            } catch (XMLStreamException e) {
-                throw new ImportException("XMLStreamException processing Dublin Core XML record: "+e.getMessage());
+            } catch (Exception e) {
+                throw new ImportException("Failed to process Dublin Core XML record: "+ e.getClass() + " (" + e.getMessage() + ")");
             }
         } else if ("dataverse_json".equals(metadataFormat)) {
             // This is Dataverse metadata already formatted in JSON.
@@ -371,12 +373,20 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
                 importedDataset = engineSvc.submit(new CreateDatasetCommand(ds, dataverseRequest, false, ImportType.HARVEST));
             }
 
-        } catch (JsonParseException ex) {
-            logger.log(Level.INFO, "Error parsing datasetVersion: {0}", ex.getMessage());
-            throw new ImportException("Error parsing datasetVersion: " + ex.getMessage(), ex);
-        } catch (CommandException ex) {
-            logger.log(Level.INFO, "Error excuting Create dataset command: {0}", ex.getMessage());
-            throw new ImportException("Error excuting dataverse command: " + ex.getMessage(), ex);
+        } catch (Exception ex) {
+            logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage());
+            FileOutputStream savedJsonFileStream = new FileOutputStream(new File(metadataFile.getAbsolutePath() + ".json"));
+            byte[] jsonBytes = json.getBytes();
+            int i = 0;
+            while (i < jsonBytes.length) {
+                int chunkSize = i + 8192 <= jsonBytes.length ? 8192 : jsonBytes.length - i;
+                savedJsonFileStream.write(jsonBytes, i, chunkSize);
+                i += chunkSize;
+                savedJsonFileStream.flush();
+            }
+            savedJsonFileStream.close();
+            logger.info("JSON produced saved in " + metadataFile.getAbsolutePath() + ".json");
+            throw new ImportException("Failed to import harvested dataset: " + ex.getClass() + " (" + ex.getMessage() + ")", ex);
         }
         return importedDataset;
     }
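One note on the widened format check in doImportHarvestedDataset: the added regex routes any metadata prefix that merely starts with oai_ddi (for example oai_ddi25) to the DDI importer, rather than requiring an exact match. A sketch of the predicate in isolation; the sample prefixes are hypothetical:

```java
public class MetadataFormatCheckDemo {
    static boolean isDdi(String metadataFormat) {
        return "ddi".equalsIgnoreCase(metadataFormat)
                || "oai_ddi".equals(metadataFormat)
                || metadataFormat.toLowerCase().matches("^oai_ddi.*");
    }

    public static void main(String[] args) {
        System.out.println(isDdi("oai_ddi25"));      // true  (versioned DDI prefix)
        System.out.println(isDdi("oai_dc"));         // false (handled by the DC branch)
        System.out.println(isDdi("dataverse_json")); // false (handled separately)
    }
}
```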
- if (line.matches(" -1) { - if (!line.substring(i).matches("^]*/")) { + while ((i = line.indexOf("<"+XML_METADATA_TAG, i)) > -1) { + if (!line.substring(i).matches("^<"+XML_METADATA_TAG+"[^>]*/")) { // don't count if it's a closed, empty tag: // mopen++; } - i+=10; + i+=XML_METADATA_TAG_OPEN.length(); } } - if (line.matches(".*.*")) { + if (line.matches(".*"+XML_METADATA_TAG_CLOSE+".*")) { int i = 0; - while ((i = line.indexOf("", i)) > -1) { - i+=11; + while ((i = line.indexOf(XML_METADATA_TAG_CLOSE, i)) > -1) { + i+=XML_METADATA_TAG_CLOSE.length(); mclose++; } if ( mclose > mopen ) { - line = line.substring(0, line.lastIndexOf("")); + line = line.substring(0, line.lastIndexOf(XML_METADATA_TAG_CLOSE)); metadataWritten = true; } } @@ -262,10 +288,13 @@ public void harvestRecord(String baseURL, String identifier, String metadataPref // the first "real" XML element (of the form // ). So we need to skip these! - while ( (line.indexOf('<', offset) > -1) - && - "",offset)) < 0)) { + ((offset = line.indexOf(XML_COMMENT_END,offset)) < 0)) { line = line.replaceAll("[\n\r]", " "); offset = line.length(); line = line.concat(rd.readLine()); } - offset += 3; + offset += XML_COMMENT_END.length(); } // if we have skipped some comments, is there another @@ -319,10 +348,11 @@ public void harvestRecord(String baseURL, String identifier, String metadataPref int i = firstElementStart; - if (!line.substring(i).matches("^<[^>]*xmlns.*")) { + if (!line.substring(i).matches("^<[^>]*"+XML_XMLNS_XSI_ATTRIBUTE_TAG+".*")) { String head = line.substring(0, i); String tail = line.substring(i); - tail = tail.replaceFirst(">", " xmlns=\"http://www.openarchives.org/OAI/2.0/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">"); + //tail = tail.replaceFirst(">", " xmlns=\"http://www.openarchives.org/OAI/2.0/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">"); + tail = tail.replaceFirst(">", XML_XMLNS_XSI_ATTRIBUTE); line = head + tail; } @@ -340,6 +370,7 @@ public void harvestRecord(String baseURL, String identifier, String metadataPref } else { oaiResponseHeader = oaiResponseHeader.concat(line); } + } } // parse the OAI Record header: diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java index f5e1a4ca976..4d546d57eea 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java @@ -50,6 +50,8 @@ import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler; import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException; import edu.harvard.iq.dataverse.search.IndexServiceBean; +import java.io.FileWriter; +import java.io.PrintWriter; import javax.persistence.EntityManager; import javax.persistence.PersistenceContext; @@ -151,10 +153,14 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId MutableBoolean harvestErrorOccurred = new MutableBoolean(false); String logTimestamp = logFormatter.format(new Date()); Logger hdLogger = Logger.getLogger("edu.harvard.iq.dataverse.harvest.client.HarvesterServiceBean." 
diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
index f5e1a4ca976..4d546d57eea 100644
--- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
@@ -50,6 +50,8 @@
 import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
 import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException;
 import edu.harvard.iq.dataverse.search.IndexServiceBean;
+import java.io.FileWriter;
+import java.io.PrintWriter;
 
 import javax.persistence.EntityManager;
 import javax.persistence.PersistenceContext;
@@ -151,10 +153,14 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId
         MutableBoolean harvestErrorOccurred = new MutableBoolean(false);
         String logTimestamp = logFormatter.format(new Date());
         Logger hdLogger = Logger.getLogger("edu.harvard.iq.dataverse.harvest.client.HarvesterServiceBean." + harvestingDataverse.getAlias() + logTimestamp);
-        String logFileName = "../logs" + File.separator + "harvest_" + harvestingClientConfig.getName() + logTimestamp + ".log";
+        String logFileName = "../logs" + File.separator + "harvest_" + harvestingClientConfig.getName() + "_" + logTimestamp + ".log";
         FileHandler fileHandler = new FileHandler(logFileName);
         hdLogger.setUseParentHandlers(false);
         hdLogger.addHandler(fileHandler);
+
+        PrintWriter importCleanupLog = new PrintWriter(new FileWriter( "../logs/harvest_cleanup_" + harvestingClientConfig.getName() + "_" + logTimestamp+".txt"));
+
+
         List<Long> harvestedDatasetIds = null;
         List<Long> harvestedDatasetIdsThisBatch = new ArrayList<Long>();
@@ -177,7 +183,7 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId
 
             if (harvestingClientConfig.isOai()) {
-                harvestedDatasetIds = harvestOAI(dataverseRequest, harvestingClientConfig, hdLogger, harvestErrorOccurred, failedIdentifiers, deletedIdentifiers, harvestedDatasetIdsThisBatch);
+                harvestedDatasetIds = harvestOAI(dataverseRequest, harvestingClientConfig, hdLogger, importCleanupLog, harvestErrorOccurred, failedIdentifiers, deletedIdentifiers, harvestedDatasetIdsThisBatch);
             } else {
                 throw new IOException("Unsupported harvest type");
             }
@@ -221,6 +227,7 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId
             harvestingClientService.resetHarvestInProgress(harvestingClientId);
             fileHandler.close();
             hdLogger.removeHandler(fileHandler);
+            importCleanupLog.close();
         }
     }
@@ -231,7 +238,7 @@ public void doHarvest(DataverseRequest dataverseRequest, Long harvestingClientId
      * @param harvestErrorOccurred  have we encountered any errors during harvest?
      * @param failedIdentifiers     Study Identifiers for failed "GetRecord" requests
      */
-    private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, Logger hdLogger, MutableBoolean harvestErrorOccurred, List<String> failedIdentifiers, List<String> deletedIdentifiers, List<Long> harvestedDatasetIdsThisBatch)
+    private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, Logger hdLogger, PrintWriter importCleanupLog, MutableBoolean harvestErrorOccurred, List<String> failedIdentifiers, List<String> deletedIdentifiers, List<Long> harvestedDatasetIdsThisBatch)
             throws IOException, ParserConfigurationException, SAXException, TransformerException {
 
         logBeginOaiHarvest(hdLogger, harvestingClient);
@@ -262,7 +269,7 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
                 MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);
 
                 // Retrieve and process this record with a separate GetRecord call:
-                Long datasetId = processRecord(dataverseRequest, hdLogger, oaiHandler, identifier, getRecordErrorOccurred, processedSizeThisBatch, deletedIdentifiers);
+                Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, processedSizeThisBatch, deletedIdentifiers);
 
                 hdLogger.info("Total content processed in this batch so far: "+processedSizeThisBatch);
                 if (datasetId != null) {
@@ -278,6 +285,8 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
                 if (getRecordErrorOccurred.booleanValue() == true) {
                     failedIdentifiers.add(identifier);
                     harvestErrorOccurred.setValue(true);
+                    //temporary:
+                    //throw new IOException("Exception occured, stopping harvest");
                 }
 
                 // reindexing in batches? - this is from DVN 3;
@@ -307,7 +316,7 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien
 
     @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
-    public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, MutableLong processedSizeThisBatch, List<String> deletedIdentifiers) {
+    public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, MutableLong processedSizeThisBatch, List<String> deletedIdentifiers) {
         String errMessage = null;
         Dataset harvestedDataset = null;
         logGetRecord(hdLogger, oaiHandler, identifier);
@@ -334,15 +343,16 @@ public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, Oa
                 }
             } else {
-                hdLogger.fine("Successfully retrieved GetRecord response.");
+                hdLogger.info("Successfully retrieved GetRecord response.");
 
                 tempFile = record.getMetadataFile();
+                PrintWriter cleanupLog;
                 harvestedDataset = importService.doImportHarvestedDataset(dataverseRequest,
                         oaiHandler.getHarvestingClient(),
                         identifier,
                         oaiHandler.getMetadataPrefix(),
                         record.getMetadataFile(),
-                        null);
+                        importCleanupLog);
 
                 hdLogger.fine("Harvest Successful for identifier " + identifier);
                 hdLogger.fine("Size of this record: " + record.getMetadataFile().length());
@@ -355,7 +365,10 @@ public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, Oa
         } finally {
             if (tempFile != null) {
-                try{tempFile.delete();}catch(Throwable t){};
+                // temporary - let's not delete the temp metadata file if anything went wrong, for now:
+                if (errMessage == null) {
+                    try{tempFile.delete();}catch(Throwable t){};
+                }
             }
         }
@@ -445,6 +458,9 @@ public void logGetRecordException(Logger hdLogger, OaiHandler oaiHandler, String
                 +e.getMessage();
 
         hdLogger.log(Level.SEVERE, errMessage);
+
+        // temporary:
+        e.printStackTrace();
     }
diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java
index ee2ddb8bd12..12d3ebac6f3 100644
--- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java
+++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java
@@ -93,8 +93,8 @@ public void setId(Long id) {
 
     public static final String HARVEST_STYLE_DESCRIPTION_DEFAULT="Generic OAI resource (DC)";
 
-    public static final List<String> HARVEST_STYLE_LIST = Arrays.asList(HARVEST_STYLE_DATAVERSE, HARVEST_STYLE_VDC, HARVEST_STYLE_NESSTAR, HARVEST_STYLE_ROPER, HARVEST_STYLE_HGL, HARVEST_STYLE_DEFAULT);
-    public static final List<String> HARVEST_STYLE_DESCRIPTION_LIST = Arrays.asList(HARVEST_STYLE_DESCRIPTION_DATAVERSE, HARVEST_STYLE_DESCRIPTION_VDC, HARVEST_STYLE_DESCRIPTION_NESSTAR, HARVEST_STYLE_DESCRIPTION_ROPER, HARVEST_STYLE_DESCRIPTION_HGL, HARVEST_STYLE_DESCRIPTION_DEFAULT);
+    public static final List<String> HARVEST_STYLE_LIST = Arrays.asList(HARVEST_STYLE_DATAVERSE, HARVEST_STYLE_VDC, HARVEST_STYLE_ICPSR, HARVEST_STYLE_NESSTAR, HARVEST_STYLE_ROPER, HARVEST_STYLE_HGL, HARVEST_STYLE_DEFAULT);
+    public static final List<String> HARVEST_STYLE_DESCRIPTION_LIST = Arrays.asList(HARVEST_STYLE_DESCRIPTION_DATAVERSE, HARVEST_STYLE_DESCRIPTION_VDC, HARVEST_STYLE_DESCRIPTION_ICPSR, HARVEST_STYLE_DESCRIPTION_NESSTAR, HARVEST_STYLE_DESCRIPTION_ROPER, HARVEST_STYLE_DESCRIPTION_HGL, HARVEST_STYLE_DESCRIPTION_DEFAULT);
 
     public static final Map<String, String> HARVEST_STYLE_INFOMAP = new LinkedHashMap<String, String>();
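Note that HARVEST_STYLE_ICPSR and HARVEST_STYLE_DESCRIPTION_ICPSR are inserted at the same position in both lists: the two lists are parallel, so a style and its description presumably get paired up by index (for example when populating HARVEST_STYLE_INFOMAP). A hypothetical sketch of that pairing, with made-up values:

```java
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class HarvestStyleMapDemo {
    public static void main(String[] args) {
        // Made-up stand-ins for the HARVEST_STYLE_* constants:
        List<String> styles = Arrays.asList("dataverse", "vdc", "icpsr");
        List<String> descriptions = Arrays.asList("Dataverse v4+", "DVN, v2-3", "ICPSR");

        Map<String, String> infoMap = new LinkedHashMap<>();
        for (int i = 0; i < styles.size(); i++) {
            infoMap.put(styles.get(i), descriptions.get(i)); // pairing is by index
        }
        // Inserting a style into one list but not the other (or at a different
        // position) would silently mislabel every style after it.
        System.out.println(infoMap);
    }
}
```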
a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java
index 28cc1a288a5..1da7f2ea34a 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java
@@ -404,7 +404,7 @@ public DataFile parseDataFile(JsonObject datafileJson) {
         if (contentType == null) {
             contentType = "application/octet-stream";
         }
-        String storageIdentifier = datafileJson.getString("storageIdentifier");
+        String storageIdentifier = datafileJson.getString("storageIdentifier", " ");
         JsonObject checksum = datafileJson.getJsonObject("checksum");
         if (checksum != null) {
             // newer style that allows for SHA-1 rather than MD5
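Why the two-argument getString matters here: javax.json's JsonObject.getString(String) throws a NullPointerException when the key has no mapping, whereas the overload with a default simply returns the fallback. Harvested dataverse_json records may lack storageIdentifier, so the parser now degrades gracefully. A sketch, assuming a JSON-P implementation is on the classpath:

```java
import javax.json.Json;
import javax.json.JsonObject;

public class GetStringDefaultDemo {
    public static void main(String[] args) {
        // A datafile record with no storageIdentifier key (hypothetical):
        JsonObject datafileJson = Json.createObjectBuilder()
                .add("contentType", "application/octet-stream")
                .build();

        // datafileJson.getString("storageIdentifier");    // would throw NullPointerException
        String storageIdentifier = datafileJson.getString("storageIdentifier", " ");
        System.out.println("[" + storageIdentifier + "]"); // prints "[ ]"
    }
}
```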