Commit 9c82c2d

An implementation of ListRecords-based harvesting without cheating (without additional GetRecord calls) #10909
1 parent 5d000f3 commit 9c82c2d

File tree

3 files changed: +88 −32 lines changed
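What changed, in brief: until now the ListRecords response was effectively used only as a list of identifiers, and the harvester then issued a separate GetRecord call per record to fetch the metadata (the "cheating" the title refers to). This commit reads the metadata payload that already arrives inside each ListRecords record and imports it directly, keeping GetRecord only as a fallback. A minimal sketch of the access pattern, assuming the xoai record iterator (idIter) has been set up as in HarvesterServiceBean:

// Sketch, not part of the commit: per-record access with the io.gdcc.xoai
// client types used in the diffs below. Only getHeader(), getMetadata() and
// getMetadataAsString() are confirmed by this commit; the iterator setup is assumed.
while (idIter.hasNext()) {
    Record oaiRecord = idIter.next();
    Header h = oaiRecord.getHeader();   // OAI header: identifier, datestamp
    String metadataString = oaiRecord.getMetadata().getMetadataAsString();
    // non-null means the ListRecords response already carried the full
    // record, so no extra GetRecord round-trip is needed
}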

src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java (+44 −21)

@@ -215,6 +215,23 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest,
                                             File metadataFile,
                                             Date oaiDateStamp,
                                             PrintWriter cleanupLog) throws ImportException, IOException {
+
+        logger.fine("importing " + metadataFormat + " saved in " + metadataFile.getAbsolutePath());
+
+        //@todo check for an IOException here, throw ImportException instead, if caught
+        String metadataAsString = new String(Files.readAllBytes(metadataFile.toPath()));
+        return doImportHarvestedDataset(dataverseRequest, harvestingClient, harvestIdentifier, metadataFormat, metadataAsString, oaiDateStamp, cleanupLog);
+    }
+
+    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
+    public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest,
+                                            HarvestingClient harvestingClient,
+                                            String harvestIdentifier,
+                                            String metadataFormat,
+                                            String metadataString,
+                                            Date oaiDateStamp,
+                                            PrintWriter cleanupLog) throws ImportException, IOException {
+
         if (harvestingClient == null || harvestingClient.getDataverse() == null) {
             throw new ImportException("importHarvestedDataset called with a null harvestingClient, or an invalid harvestingClient.");
         }
@@ -234,32 +251,32 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest,
         // Kraffmiller's export modules; replace the logic below with clean
         // programmatic lookup of the import plugin needed.
 
+        logger.fine("importing " + metadataFormat + " for " + harvestIdentifier);
+
         if ("ddi".equalsIgnoreCase(metadataFormat) || "oai_ddi".equals(metadataFormat)
                 || metadataFormat.toLowerCase().matches("^oai_ddi.*")) {
             try {
-                String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
+                ///String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
                 // TODO:
                 // import type should be configurable - it should be possible to
                 // select whether you want to harvest with or without files,
                 // ImportType.HARVEST vs. ImportType.HARVEST_WITH_FILES
-                logger.fine("importing DDI "+metadataFile.getAbsolutePath());
-                dsDTO = importDDIService.doImport(ImportType.HARVEST, xmlToParse);
-            } catch (IOException | XMLStreamException | ImportException e) {
+                ///logger.fine("importing DDI "+metadataFile.getAbsolutePath());
+                dsDTO = importDDIService.doImport(ImportType.HARVEST, metadataString);
+            } catch (XMLStreamException | ImportException e) {
                 throw new ImportException("Failed to process DDI XML record: "+ e.getClass() + " (" + e.getMessage() + ")");
             }
         } else if ("dc".equalsIgnoreCase(metadataFormat) || "oai_dc".equals(metadataFormat)) {
-            logger.fine("importing DC "+metadataFile.getAbsolutePath());
+            //logger.fine("importing DC "+metadataFile.getAbsolutePath());
             try {
-                String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
-                dsDTO = importGenericService.processOAIDCxml(xmlToParse, harvestIdentifier, harvestingClient.isUseOaiIdentifiersAsPids());
-            } catch (IOException | XMLStreamException e) {
+                ///String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
+                dsDTO = importGenericService.processOAIDCxml(metadataString, harvestIdentifier, harvestingClient.isUseOaiIdentifiersAsPids());
+            } catch (XMLStreamException e) {
                 throw new ImportException("Failed to process Dublin Core XML record: "+ e.getClass() + " (" + e.getMessage() + ")");
             }
         } else if ("dataverse_json".equals(metadataFormat)) {
             // This is Dataverse metadata already formatted in JSON.
-            // Simply read it into a string, and pass to the final import further down:
-            logger.fine("Attempting to import custom dataverse metadata from file "+metadataFile.getAbsolutePath());
-            json = new String(Files.readAllBytes(metadataFile.toPath()));
+            json = metadataString;
         } else {
             throw new ImportException("Unsupported import metadata format: " + metadataFormat);
         }
@@ -394,17 +411,23 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest,
 
         } catch (JsonParseException | ImportException | CommandException ex) {
             logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage());
-            FileOutputStream savedJsonFileStream = new FileOutputStream(new File(metadataFile.getAbsolutePath() + ".json"));
-            byte[] jsonBytes = json.getBytes();
-            int i = 0;
-            while (i < jsonBytes.length) {
-                int chunkSize = i + 8192 <= jsonBytes.length ? 8192 : jsonBytes.length - i;
-                savedJsonFileStream.write(jsonBytes, i, chunkSize);
-                i += chunkSize;
-                savedJsonFileStream.flush();
+
+            if (!"dataverse_json".equals(metadataFormat) && json != null) {
+                // If this was an xml format that we were able to transform into
+                // our json, let's save it for debugging etc. purposes
+                File tempFile = File.createTempFile("meta", ".json");
+                FileOutputStream savedJsonFileStream = new FileOutputStream(tempFile);
+                byte[] jsonBytes = json.getBytes();
+                int i = 0;
+                while (i < jsonBytes.length) {
+                    int chunkSize = i + 8192 <= jsonBytes.length ? 8192 : jsonBytes.length - i;
+                    savedJsonFileStream.write(jsonBytes, i, chunkSize);
+                    i += chunkSize;
+                    savedJsonFileStream.flush();
+                }
+                savedJsonFileStream.close();
+                logger.info("JSON produced saved in " + tempFile.getAbsolutePath());
             }
-            savedJsonFileStream.close();
-            logger.info("JSON produced saved in " + metadataFile.getAbsolutePath() + ".json");
             throw new ImportException("Failed to import harvested dataset: " + ex.getClass() + " (" + ex.getMessage() + ")", ex);
         }
         return importedDataset;
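The practical effect of the new overload: a caller that already holds the harvested record in memory can import it without a temp-file round-trip, while the original File-based signature simply delegates to it. A hypothetical call site; everything other than the doImportHarvestedDataset signature itself (the service handle, request, client, and log objects) is assumed from the caller's context:

// Hypothetical usage sketch; identifier and XML values are made up.
String metadataString = "<codeBook>...</codeBook>";   // e.g. a DDI record already in memory
try {
    Dataset harvested = importService.doImportHarvestedDataset(
            dataverseRequest,
            harvestingClient,
            "oai:example.org:12345",   // harvestIdentifier
            "oai_ddi",                 // metadataFormat
            metadataString,            // the new String-based parameter
            new Date(),                // oaiDateStamp
            cleanupLog);
} catch (ImportException | IOException ex) {
    // IOException remains in the signature even though this overload
    // no longer reads from disk
}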

src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java (+2 −2)

@@ -81,8 +81,8 @@ public class FastGetRecord {
     private static final String XML_METADATA_TAG_OPEN = "<"+XML_METADATA_TAG+">";
     private static final String XML_METADATA_TAG_CLOSE = "</"+XML_METADATA_TAG+">";
     private static final String XML_OAI_PMH_CLOSING_TAGS = "</record></GetRecord></OAI-PMH>";
-    private static final String XML_XMLNS_XSI_ATTRIBUTE_TAG = "xmlns:xsi=";
-    private static final String XML_XMLNS_XSI_ATTRIBUTE = " "+XML_XMLNS_XSI_ATTRIBUTE_TAG+"\"http://www.w3.org/2001/XMLSchema-instance\">";
+    public static final String XML_XMLNS_XSI_ATTRIBUTE_TAG = "xmlns:xsi=";
+    public static final String XML_XMLNS_XSI_ATTRIBUTE = " "+XML_XMLNS_XSI_ATTRIBUTE_TAG+"\"http://www.w3.org/2001/XMLSchema-instance\">";
     private static final String XML_COMMENT_START = "<!--";
     private static final String XML_COMMENT_END = "-->";
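Widening these constants to public lets HarvesterServiceBean (next file) reuse FastGetRecord's xmlns:xsi sanitation on the metadata strings pulled out of ListRecords responses. A self-contained sketch of what that check does, with the constant values mirrored locally:

// Standalone demo, mirroring the two constants above; not part of the commit.
public class XsiSanitationDemo {
    static final String XML_XMLNS_XSI_ATTRIBUTE_TAG = "xmlns:xsi=";
    static final String XML_XMLNS_XSI_ATTRIBUTE =
            " " + XML_XMLNS_XSI_ATTRIBUTE_TAG + "\"http://www.w3.org/2001/XMLSchema-instance\">";

    public static void main(String[] args) {
        String metadataString = "<oai_dc:dc xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\"><dc:title>x</dc:title></oai_dc:dc>";
        // matches() must cover the whole string, and '.' does not cross newlines,
        // so this test behaves as intended for single-line payloads.
        if (!metadataString.matches("^<[^>]*" + XML_XMLNS_XSI_ATTRIBUTE_TAG + ".*")) {
            // splice the attribute into the root element by replacing its first '>'
            metadataString = metadataString.replaceFirst(">", XML_XMLNS_XSI_ATTRIBUTE);
        }
        System.out.println(metadataString);
        // root element now carries xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    }
}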

src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java (+42 −9)

@@ -39,10 +39,11 @@
 import edu.harvard.iq.dataverse.EjbDataverseEngine;
 import edu.harvard.iq.dataverse.api.imports.ImportServiceBean;
 import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
+import static edu.harvard.iq.dataverse.harvest.client.FastGetRecord.XML_XMLNS_XSI_ATTRIBUTE_TAG;
+import static edu.harvard.iq.dataverse.harvest.client.FastGetRecord.XML_XMLNS_XSI_ATTRIBUTE;
 import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
 import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException;
 import edu.harvard.iq.dataverse.search.IndexServiceBean;
-import io.gdcc.xoai.xml.XmlWriter;
 import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.InputStream;
@@ -296,11 +297,12 @@ private void harvestOAIviaListRecords(OaiHandler oaiHandler, DataverseRequest da
 
             Record oaiRecord = idIter.next();
 
-            /*try {
-                harvesterLogger.info("record.getMetadata() (via XmlWriter):" + XmlWriter.toString(oaiRecord.getMetadata()));
-            } catch (XMLStreamException xsx) {
-                harvesterLogger.info("Caught an XMLStreamException: " + xsx.getMessage());
-            }*/
+            //try {
+            //harvesterLogger.info("record.getMetadata() (via getMetadataAsString()):" + oaiRecord.getMetadata().getMetadataAsString());
+            System.out.println("record.getMetadata() (via getMetadataAsString()):" + oaiRecord.getMetadata().getMetadataAsString());
+            //} catch (XMLStreamException xsx) {
+            //    harvesterLogger.info("Caught an XMLStreamException: " + xsx.getMessage());
+            //}
 
 
             Header h = oaiRecord.getHeader();
@@ -318,10 +320,41 @@ private void harvestOAIviaListRecords(OaiHandler oaiHandler, DataverseRequest da
 
             MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);
 
-            //Metadata oaiMetadata = oaiRecord.getMetadata();
+            Metadata oaiMetadata = oaiRecord.getMetadata();
+            String metadataString = oaiMetadata.getMetadataAsString();
 
-            // Retrieve and process this record with a separate GetRecord call:
-            Long datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);
+            Long datasetId = null;
+
+            if (metadataString != null) {
+                Dataset harvestedDataset = null;
+
+                // Some xml header sanitation:
+                if (!metadataString.matches("^<[^>]*" + XML_XMLNS_XSI_ATTRIBUTE_TAG + ".*")) {
+                    metadataString = metadataString.replaceFirst(">", XML_XMLNS_XSI_ATTRIBUTE);
+                }
+
+                try {
+                    harvestedDataset = importService.doImportHarvestedDataset(dataverseRequest,
+                            oaiHandler.getHarvestingClient(),
+                            identifier,
+                            oaiHandler.getMetadataPrefix(),
+                            metadataString,
+                            dateStamp,
+                            importCleanupLog);
+
+                    harvesterLogger.fine("Harvest Successful for identifier " + identifier);
+                    harvesterLogger.fine("Size of this record: " + metadataString.length());
+                } catch (Throwable e) {
+                    logGetRecordException(harvesterLogger, oaiHandler, identifier, e);
+                }
+                if (harvestedDataset != null) {
+                    datasetId = harvestedDataset.getId();
+                }
+            } else {
+                // Instead of giving up here, let's try to retrieve and process
+                // this record with a separate GetRecord call:
+                datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);
+            }
 
             if (datasetId != null) {
                 harvestedDatasetIds.add(datasetId);
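Distilled, the new per-record decision is: if ListRecords shipped the metadata inline, import it straight from the string; only when it did not (metadataString is null) fall back to the old one-GetRecord-per-record path via processRecord. A compressed restatement of the logic added above, with the sanitation step and fine-grained logging omitted; every name used here appears in this hunk:

// Condensed sketch of the block above; assumes the same surrounding
// scope as harvestOAIviaListRecords. Not part of the commit.
Long datasetId = null;
String metadataString = oaiRecord.getMetadata().getMetadataAsString();
if (metadataString != null) {
    // inline payload present: import without any further OAI calls
    try {
        Dataset d = importService.doImportHarvestedDataset(dataverseRequest,
                oaiHandler.getHarvestingClient(), identifier,
                oaiHandler.getMetadataPrefix(), metadataString, dateStamp, importCleanupLog);
        datasetId = (d != null) ? d.getId() : null;
    } catch (Throwable e) {
        logGetRecordException(harvesterLogger, oaiHandler, identifier, e);
    }
} else {
    // no inline payload: fall back to a separate GetRecord request
    datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler,
            identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);
}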
