From 711dc6362dc629269d7db5840eb13821fc978682 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 9 Dec 2022 10:39:44 -0500 Subject: [PATCH 1/9] extra metadata from NetCDF and HDF5 files in NcML format #9153 --- doc/release-notes/9153-extract-metadata.md | 1 + .../source/user/dataset-management.rst | 7 ++ .../edu/harvard/iq/dataverse/DatasetPage.java | 1 + .../iq/dataverse/EditDatafilesPage.java | 1 + .../datadeposit/MediaResourceManagerImpl.java | 1 + .../datasetutility/AddReplaceFileHelper.java | 2 + .../dataverse/ingest/IngestServiceBean.java | 64 ++++++++++++++++++- .../harvard/iq/dataverse/api/NetcdfIT.java | 57 +++++++++++++++++ 8 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 doc/release-notes/9153-extract-metadata.md create mode 100644 src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java diff --git a/doc/release-notes/9153-extract-metadata.md b/doc/release-notes/9153-extract-metadata.md new file mode 100644 index 00000000000..ce4cc714805 --- /dev/null +++ b/doc/release-notes/9153-extract-metadata.md @@ -0,0 +1 @@ +For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML (XML) format and save it as an auxiliary file. diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index ec3bb392ce5..e891ca72880 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -299,6 +299,13 @@ Astronomy (FITS) Metadata found in the header section of `Flexible Image Transport System (FITS) files `_ are automatically extracted by the Dataverse Software, aggregated and displayed in the Astronomy Domain-Specific Metadata of the Dataset that the file belongs to. This FITS file metadata, is therefore searchable and browsable (facets) at the Dataset-level. +NetCDF and HDF5 +--------------- + +For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML_ (XML) format and save it as an auxiliary file. (See also :doc:`/developers/aux-file-support` in the Developer Guide.) + +.. 
_NcML: https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_overview.html + Compressed Files ---------------- diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 6e71f6c5042..b538aaca2c6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -3733,6 +3733,7 @@ public String save() { // Call Ingest Service one more time, to // queue the data ingest jobs for asynchronous execution: ingestService.startIngestJobsForDataset(dataset, (AuthenticatedUser) session.getUser()); + ingestService.extractMetadata(dataset, (AuthenticatedUser) session.getUser()); //After dataset saved, then persist prov json data if(systemConfig.isProvCollectionEnabled()) { diff --git a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java index fc8df8681af..d045126a3aa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java @@ -1225,6 +1225,7 @@ public String save() { // queue the data ingest jobs for asynchronous execution: if (mode == FileEditMode.UPLOAD) { ingestService.startIngestJobsForDataset(dataset, (AuthenticatedUser) session.getUser()); + ingestService.extractMetadata(dataset, (AuthenticatedUser) session.getUser()); } if (FileEditMode.EDIT == mode && Referrer.FILE == referrer && fileMetadatas.size() > 0) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java b/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java index 5491024c73c..e8d25bb4148 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java @@ -373,6 +373,7 @@ DepositReceipt replaceOrAddFiles(String uri, Deposit deposit, AuthCredentials au } ingestService.startIngestJobsForDataset(dataset, user); + ingestService.extractMetadata(dataset, user); ReceiptGenerator receiptGenerator = new ReceiptGenerator(); String baseUrl = urlManager.getHostnamePlusBaseUrlPath(uri); diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java index febbb249a91..5277d014430 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java @@ -1932,6 +1932,7 @@ private boolean step_100_startIngestJobs(){ // start the ingest! 
ingestService.startIngestJobsForDataset(dataset, dvRequest.getAuthenticatedUser()); msg("post ingest start"); + ingestService.extractMetadata(dataset, dvRequest.getAuthenticatedUser()); } return true; } @@ -2145,6 +2146,7 @@ public Response addFiles(String jsonData, Dataset dataset, User authUser) { } //ingest job ingestService.startIngestJobsForDataset(dataset, (AuthenticatedUser) authUser); + ingestService.extractMetadata(dataset, (AuthenticatedUser) authUser); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index b03bae618a4..e261efce642 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -20,6 +20,8 @@ package edu.harvard.iq.dataverse.ingest; +import edu.harvard.iq.dataverse.AuxiliaryFile; +import edu.harvard.iq.dataverse.AuxiliaryFileServiceBean; import edu.harvard.iq.dataverse.ControlledVocabularyValue; import edu.harvard.iq.dataverse.datavariable.VariableCategory; import edu.harvard.iq.dataverse.datavariable.VariableServiceBean; @@ -72,6 +74,7 @@ //import edu.harvard.iq.dvn.unf.*; import org.dataverse.unf.*; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -81,6 +84,7 @@ import java.nio.channels.FileChannel; import java.nio.channels.ReadableByteChannel; import java.nio.channels.WritableByteChannel; +import java.nio.charset.StandardCharsets; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; @@ -113,6 +117,9 @@ import javax.jms.QueueSession; import javax.jms.Message; import javax.faces.application.FacesMessage; +import javax.ws.rs.core.MediaType; +import ucar.nc2.NetcdfFile; +import ucar.nc2.NetcdfFiles; /** * @@ -134,6 +141,8 @@ public class IngestServiceBean { @EJB DataFileServiceBean fileService; @EJB + AuxiliaryFileServiceBean auxiliaryFileService; + @EJB SystemConfig systemConfig; @Resource(lookup = "java:app/jms/queue/ingest") @@ -343,6 +352,7 @@ public List saveAndAddFilesToDataset(DatasetVersion version, try { // FITS is the only type supported for metadata // extraction, as of now. -- L.A. 4.0 + // Consider adding other formats such as NetCDF/HDF5. dataFile.setContentType("application/fits"); metadataExtracted = extractMetadata(tempFileLocation, dataFile, version); } catch (IOException mex) { @@ -565,7 +575,58 @@ public int compare(DataFile d1, DataFile d2) { return sb.toString(); } - + // Note: There is another method called extractMetadata for FITS files. + public void extractMetadata(Dataset dataset, AuthenticatedUser user) { + for (DataFile dataFile : dataset.getFiles()) { + Path pathToLocalDataFile = null; + try { + pathToLocalDataFile = dataFile.getStorageIO().getFileSystemPath(); + } catch (IOException ex) { + logger.info("Exception calling dataAccess.getFileSystemPath: " + ex); + } + InputStream inputStream = null; + if (pathToLocalDataFile != null) { + try ( NetcdfFile netcdfFile = NetcdfFiles.open(pathToLocalDataFile.toString())) { + if (netcdfFile != null) { + // TODO: What should we pass as a URL to toNcml()? 
+ String ncml = netcdfFile.toNcml("FIXME_URL"); + inputStream = new ByteArrayInputStream(ncml.getBytes(StandardCharsets.UTF_8)); + } else { + logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + " (null returned)."); + } + } catch (IOException ex) { + logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + ". Exception caught: " + ex); + } + } else { + logger.info("pathToLocalDataFile is null! Are you on S3? Metadata extraction from NetCDF/HDF5 is not yet available."); + // As a tabular file, we'll probably need to download the NetCDF/HDF5 files from S3 and then try to extra the metadata, + // unless we can get some sort of S3 interface working: + // https://docs.unidata.ucar.edu/netcdf-java/current/userguide/dataset_urls.html#object-stores + // If we need to download the file and extract only some of the bytes (hopefully the first bytes) here's the spec for NetCDF: + // https://docs.unidata.ucar.edu/netcdf-c/current/file_format_specifications.html + } + if (inputStream != null) { + // TODO: What should the tag be? + String formatTag = "ncml"; + // TODO: What should the version be? + String formatVersion = "0.1"; + // TODO: What should the origin be? + String origin = "myOrigin"; + boolean isPublic = true; + // TODO: What should the type be? + String type = "myType"; + // TODO: Does NcML have its own content type? (MIME type) + MediaType mediaType = new MediaType("text", "xml"); + try { + AuxiliaryFile auxFile = auxiliaryFileService.processAuxiliaryFile(inputStream, dataFile, formatTag, formatVersion, origin, isPublic, type, mediaType); + logger.info("Aux file extracted from NetCDF/HDF5 file saved: " + auxFile); + } catch (Exception ex) { + logger.info("exception throw calling processAuxiliaryFile: " + ex); + } + } + } + } + public void produceSummaryStatistics(DataFile dataFile, File generatedTabularFile) throws IOException { /* logger.info("Skipping summary statistics and UNF."); @@ -1159,6 +1220,7 @@ public boolean fileMetadataExtractable(DataFile dataFile) { * extractMetadata: * framework for extracting metadata from uploaded files. The results will * be used to populate the metadata of the Dataset to which the file belongs. + * Note that another method called extractMetadata creates aux files from data files. 
*/ public boolean extractMetadata(String tempFileLocation, DataFile dataFile, DatasetVersion editVersion) throws IOException { boolean ingestSuccessful = false; diff --git a/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java b/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java new file mode 100644 index 00000000000..a83af514935 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java @@ -0,0 +1,57 @@ +package edu.harvard.iq.dataverse.api; + +import com.jayway.restassured.RestAssured; +import com.jayway.restassured.path.json.JsonPath; +import com.jayway.restassured.response.Response; +import java.io.IOException; +import static javax.ws.rs.core.Response.Status.CREATED; +import static javax.ws.rs.core.Response.Status.OK; +import org.junit.BeforeClass; +import org.junit.Test; + +public class NetcdfIT { + + @BeforeClass + public static void setUp() { + RestAssured.baseURI = UtilIT.getRestAssuredBaseUri(); + } + + @Test + public void testNmclFromNetcdf() throws IOException { + Response createUser = UtilIT.createRandomUser(); + createUser.then().assertThat().statusCode(OK.getStatusCode()); + String apiToken = UtilIT.getApiTokenFromResponse(createUser); + String username = UtilIT.getUsernameFromResponse(createUser); + + Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken); + createDataverseResponse.prettyPrint(); + createDataverseResponse.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse); + + Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken); + createDataset.prettyPrint(); + createDataset.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset); + String datasetPid = UtilIT.getDatasetPersistentIdFromResponse(createDataset); + + String pathToFile = "src/test/resources/netcdf/madis-raob"; + + Response uploadFile = UtilIT.uploadFileViaNative(datasetId.toString(), pathToFile, apiToken); + uploadFile.prettyPrint(); + uploadFile.then().assertThat().statusCode(OK.getStatusCode()); + + long fileId = JsonPath.from(uploadFile.body().asString()).getLong("data.files[0].dataFile.id"); + String tag = "ncml"; + String version = "0.1"; + + Response downloadNcml = UtilIT.downloadAuxFile(fileId, tag, version, apiToken); + //downloadNcml.prettyPrint(); // long output + downloadNcml.then().assertThat() + .statusCode(OK.getStatusCode()) + .contentType("text/xml; name=\"madis-raob.ncml_0.1.xml\";charset=UTF-8"); + } +} From 438b86cf7a04e3be44c84883f2842680bd98450a Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 14 Dec 2022 15:33:41 -0500 Subject: [PATCH 2/9] extract NcML earlier, from temp file during upload #9153 --- .../harvard/iq/dataverse/AuxiliaryFile.java | 5 +- .../dataverse/AuxiliaryFileServiceBean.java | 23 ++++- .../edu/harvard/iq/dataverse/DatasetPage.java | 1 - .../iq/dataverse/EditDatafilesPage.java | 1 - .../datadeposit/MediaResourceManagerImpl.java | 1 - .../datasetutility/AddReplaceFileHelper.java | 2 - .../dataverse/ingest/IngestServiceBean.java | 93 ++++++++----------- src/main/java/propertyFiles/Bundle.properties | 1 + .../harvard/iq/dataverse/api/NetcdfIT.java | 2 +- 9 files changed, 65 insertions(+), 64 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFile.java b/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFile.java index a7a89934f47..344032ef5e3 100644 --- 
a/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFile.java @@ -55,7 +55,10 @@ public class AuxiliaryFile implements Serializable { private String formatTag; private String formatVersion; - + + /** + * The application/entity that created the auxiliary file. + */ private String origin; private boolean isPublic; diff --git a/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFileServiceBean.java index 76c91382868..05f3e209632 100644 --- a/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/AuxiliaryFileServiceBean.java @@ -70,9 +70,13 @@ public AuxiliaryFile save(AuxiliaryFile auxiliaryFile) { * @param type how to group the files such as "DP" for "Differentially * @param mediaType user supplied content type (MIME type) * Private Statistics". - * @return success boolean - returns whether the save was successful + * @param save boolean - true to save immediately, false to let the cascade + * do persist to the database. + * @return an AuxiliaryFile with an id when save=true (assuming no + * exceptions) or an AuxiliaryFile without an id that will be persisted + * later through the cascade. */ - public AuxiliaryFile processAuxiliaryFile(InputStream fileInputStream, DataFile dataFile, String formatTag, String formatVersion, String origin, boolean isPublic, String type, MediaType mediaType) { + public AuxiliaryFile processAuxiliaryFile(InputStream fileInputStream, DataFile dataFile, String formatTag, String formatVersion, String origin, boolean isPublic, String type, MediaType mediaType, boolean save) { StorageIO storageIO = null; AuxiliaryFile auxFile = new AuxiliaryFile(); @@ -114,7 +118,14 @@ public AuxiliaryFile processAuxiliaryFile(InputStream fileInputStream, DataFile auxFile.setType(type); auxFile.setDataFile(dataFile); auxFile.setFileSize(storageIO.getAuxObjectSize(auxExtension)); - auxFile = save(auxFile); + if (save) { + auxFile = save(auxFile); + } else { + if (dataFile.getAuxiliaryFiles() == null) { + dataFile.setAuxiliaryFiles(new ArrayList<>()); + } + dataFile.getAuxiliaryFiles().add(auxFile); + } } catch (IOException ioex) { logger.severe("IO Exception trying to save auxiliary file: " + ioex.getMessage()); throw new InternalServerErrorException(); @@ -129,7 +140,11 @@ public AuxiliaryFile processAuxiliaryFile(InputStream fileInputStream, DataFile } return auxFile; } - + + public AuxiliaryFile processAuxiliaryFile(InputStream fileInputStream, DataFile dataFile, String formatTag, String formatVersion, String origin, boolean isPublic, String type, MediaType mediaType) { + return processAuxiliaryFile(fileInputStream, dataFile, formatTag, formatVersion, origin, isPublic, type, mediaType, true); + } + public AuxiliaryFile lookupAuxiliaryFile(DataFile dataFile, String formatTag, String formatVersion) { Query query = em.createNamedQuery("AuxiliaryFile.lookupAuxiliaryFile"); diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index b538aaca2c6..6e71f6c5042 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -3733,7 +3733,6 @@ public String save() { // Call Ingest Service one more time, to // queue the data ingest jobs for asynchronous execution: ingestService.startIngestJobsForDataset(dataset, (AuthenticatedUser) session.getUser()); - 
ingestService.extractMetadata(dataset, (AuthenticatedUser) session.getUser()); //After dataset saved, then persist prov json data if(systemConfig.isProvCollectionEnabled()) { diff --git a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java index d045126a3aa..fc8df8681af 100644 --- a/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java @@ -1225,7 +1225,6 @@ public String save() { // queue the data ingest jobs for asynchronous execution: if (mode == FileEditMode.UPLOAD) { ingestService.startIngestJobsForDataset(dataset, (AuthenticatedUser) session.getUser()); - ingestService.extractMetadata(dataset, (AuthenticatedUser) session.getUser()); } if (FileEditMode.EDIT == mode && Referrer.FILE == referrer && fileMetadatas.size() > 0) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java b/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java index e8d25bb4148..5491024c73c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/datadeposit/MediaResourceManagerImpl.java @@ -373,7 +373,6 @@ DepositReceipt replaceOrAddFiles(String uri, Deposit deposit, AuthCredentials au } ingestService.startIngestJobsForDataset(dataset, user); - ingestService.extractMetadata(dataset, user); ReceiptGenerator receiptGenerator = new ReceiptGenerator(); String baseUrl = urlManager.getHostnamePlusBaseUrlPath(uri); diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java index 5277d014430..febbb249a91 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java @@ -1932,7 +1932,6 @@ private boolean step_100_startIngestJobs(){ // start the ingest! ingestService.startIngestJobsForDataset(dataset, dvRequest.getAuthenticatedUser()); msg("post ingest start"); - ingestService.extractMetadata(dataset, dvRequest.getAuthenticatedUser()); } return true; } @@ -2146,7 +2145,6 @@ public Response addFiles(String jsonData, Dataset dataset, User authUser) { } //ingest job ingestService.startIngestJobsForDataset(dataset, (AuthenticatedUser) authUser); - ingestService.extractMetadata(dataset, (AuthenticatedUser) authUser); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index e261efce642..b5934c1167f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -241,6 +241,45 @@ public List saveAndAddFilesToDataset(DatasetVersion version, savedSuccess = true; logger.fine("Success: permanently saved file " + dataFile.getFileMetadata().getLabel()); + // TODO: reformat this file to remove the many tabs added in cc08330 + InputStream inputStream = null; + if (tempLocationPath != null) { + try ( NetcdfFile netcdfFile = NetcdfFiles.open(tempLocationPath.toString())) { + if (netcdfFile != null) { + // For now, empty string. What should we pass as a URL to toNcml()? 
The filename (including the path) most commonly at https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_cookbook.html + // With an empty string the XML will show 'location="file:"'. + String ncml = netcdfFile.toNcml(""); + inputStream = new ByteArrayInputStream(ncml.getBytes(StandardCharsets.UTF_8)); + } else { + logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + " (null returned)."); + } + } catch (IOException ex) { + logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + ". Exception caught: " + ex); + } + } else { + logger.info("tempLocationPath is null for file id " + dataFile.getId() + ". Can't extract NcML."); + } + if (inputStream != null) { + // If you change NcML, you must also change the previewer. + String formatTag = "NcML"; + // 0.1 is arbitrary. It's our first attempt to put out NcML so we're giving it a low number. + // If you bump the number here, be sure the bump the number in the previewer as well. + // We could use 2.2 here since that's the current version of NcML. + String formatVersion = "0.1"; + String origin = "netcdf-java"; + boolean isPublic = true; + // See also file.auxfiles.types.NcML in Bundle.properties. Used to group aux files in UI. + String type = "NcML"; + // XML because NcML doesn't have its own MIME/content type at https://www.iana.org/assignments/media-types/media-types.xhtml + MediaType mediaType = new MediaType("text", "xml"); + try { + AuxiliaryFile auxFile = auxiliaryFileService.processAuxiliaryFile(inputStream, dataFile, formatTag, formatVersion, origin, isPublic, type, mediaType, false); + logger.fine ("Aux file extracted from NetCDF/HDF5 file saved to storage (but not to the database yet) from file id " + dataFile.getId()); + } catch (Exception ex) { + logger.info("exception throw calling processAuxiliaryFile: " + ex); + } + } + } catch (IOException ioex) { logger.warning("Failed to save the file, storage id " + dataFile.getStorageIdentifier() + " (" + ioex.getMessage() + ")"); } finally { @@ -302,6 +341,7 @@ public List saveAndAddFilesToDataset(DatasetVersion version, // Any necessary post-processing: // performPostProcessingTasks(dataFile); } else { + System.out.println("driver is not tmp"); try { StorageIO dataAccess = DataAccess.getStorageIO(dataFile); //Populate metadata @@ -575,58 +615,6 @@ public int compare(DataFile d1, DataFile d2) { return sb.toString(); } - // Note: There is another method called extractMetadata for FITS files. - public void extractMetadata(Dataset dataset, AuthenticatedUser user) { - for (DataFile dataFile : dataset.getFiles()) { - Path pathToLocalDataFile = null; - try { - pathToLocalDataFile = dataFile.getStorageIO().getFileSystemPath(); - } catch (IOException ex) { - logger.info("Exception calling dataAccess.getFileSystemPath: " + ex); - } - InputStream inputStream = null; - if (pathToLocalDataFile != null) { - try ( NetcdfFile netcdfFile = NetcdfFiles.open(pathToLocalDataFile.toString())) { - if (netcdfFile != null) { - // TODO: What should we pass as a URL to toNcml()? - String ncml = netcdfFile.toNcml("FIXME_URL"); - inputStream = new ByteArrayInputStream(ncml.getBytes(StandardCharsets.UTF_8)); - } else { - logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + " (null returned)."); - } - } catch (IOException ex) { - logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + ". Exception caught: " + ex); - } - } else { - logger.info("pathToLocalDataFile is null! Are you on S3? 
Metadata extraction from NetCDF/HDF5 is not yet available."); - // As a tabular file, we'll probably need to download the NetCDF/HDF5 files from S3 and then try to extra the metadata, - // unless we can get some sort of S3 interface working: - // https://docs.unidata.ucar.edu/netcdf-java/current/userguide/dataset_urls.html#object-stores - // If we need to download the file and extract only some of the bytes (hopefully the first bytes) here's the spec for NetCDF: - // https://docs.unidata.ucar.edu/netcdf-c/current/file_format_specifications.html - } - if (inputStream != null) { - // TODO: What should the tag be? - String formatTag = "ncml"; - // TODO: What should the version be? - String formatVersion = "0.1"; - // TODO: What should the origin be? - String origin = "myOrigin"; - boolean isPublic = true; - // TODO: What should the type be? - String type = "myType"; - // TODO: Does NcML have its own content type? (MIME type) - MediaType mediaType = new MediaType("text", "xml"); - try { - AuxiliaryFile auxFile = auxiliaryFileService.processAuxiliaryFile(inputStream, dataFile, formatTag, formatVersion, origin, isPublic, type, mediaType); - logger.info("Aux file extracted from NetCDF/HDF5 file saved: " + auxFile); - } catch (Exception ex) { - logger.info("exception throw calling processAuxiliaryFile: " + ex); - } - } - } - } - public void produceSummaryStatistics(DataFile dataFile, File generatedTabularFile) throws IOException { /* logger.info("Skipping summary statistics and UNF."); @@ -1220,7 +1208,6 @@ public boolean fileMetadataExtractable(DataFile dataFile) { * extractMetadata: * framework for extracting metadata from uploaded files. The results will * be used to populate the metadata of the Dataset to which the file belongs. - * Note that another method called extractMetadata creates aux files from data files. */ public boolean extractMetadata(String tempFileLocation, DataFile dataFile, DatasetVersion editVersion) throws IOException { boolean ingestSuccessful = false; diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index b19e80020ba..0ec81cb7d6b 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -2007,6 +2007,7 @@ file.remotelyStored=This file is stored remotely - click for more info file.auxfiles.download.header=Download Auxiliary Files # These types correspond to the AuxiliaryFile.Type enum. 
file.auxfiles.types.DP=Differentially Private Statistics +file.auxfiles.types.NcML=XML from NetCDF/HDF5 (NcML) # Add more types here file.auxfiles.unspecifiedTypes=Other Auxiliary Files diff --git a/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java b/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java index a83af514935..74179b98833 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java @@ -45,7 +45,7 @@ public void testNmclFromNetcdf() throws IOException { uploadFile.then().assertThat().statusCode(OK.getStatusCode()); long fileId = JsonPath.from(uploadFile.body().asString()).getLong("data.files[0].dataFile.id"); - String tag = "ncml"; + String tag = "NcML"; String version = "0.1"; Response downloadNcml = UtilIT.downloadAuxFile(fileId, tag, version, apiToken); From 0e00766eec1d3cb043c6863b10856be877eb6da7 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 14 Dec 2022 15:59:46 -0500 Subject: [PATCH 3/9] add NetcdfIT to list of tests #9153 --- tests/integration-tests.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests.txt b/tests/integration-tests.txt index 6e6668d45af..1e9110be2de 100644 --- a/tests/integration-tests.txt +++ b/tests/integration-tests.txt @@ -1 +1 @@ -DataversesIT,DatasetsIT,SwordIT,AdminIT,BuiltinUsersIT,UsersIT,UtilIT,ConfirmEmailIT,FileMetadataIT,FilesIT,SearchIT,InReviewWorkflowIT,HarvestingServerIT,HarvestingClientsIT,MoveIT,MakeDataCountApiIT,FileTypeDetectionIT,EditDDIIT,ExternalToolsIT,AccessIT,DuplicateFilesIT,DownloadFilesIT,LinkIT,DeleteUsersIT,DeactivateUsersIT,AuxiliaryFilesIT,InvalidCharactersIT,LicensesIT,NotificationsIT,BagIT,MetadataBlocksIT +DataversesIT,DatasetsIT,SwordIT,AdminIT,BuiltinUsersIT,UsersIT,UtilIT,ConfirmEmailIT,FileMetadataIT,FilesIT,SearchIT,InReviewWorkflowIT,HarvestingServerIT,HarvestingClientsIT,MoveIT,MakeDataCountApiIT,FileTypeDetectionIT,EditDDIIT,ExternalToolsIT,AccessIT,DuplicateFilesIT,DownloadFilesIT,LinkIT,DeleteUsersIT,DeactivateUsersIT,AuxiliaryFilesIT,InvalidCharactersIT,LicensesIT,NotificationsIT,BagIT,MetadataBlocksIT,NetcdfIT From 9edaf595480aeae85185e90d24d06d064bf0dc55 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 20 Dec 2022 10:13:52 -0500 Subject: [PATCH 4/9] add "requirements" and "auxFilesExist" to external tools #9153 The use case is an external tool that operates on aux files pulled out of NetCDF/HDF5 files. 
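For reference, a sketch of the new "requirements" block as a tool manifest would declare it
(the formatTag/formatVersion values here mirror the NcML auxiliary files created by the
extraction code earlier in this series and the values used in the integration test below):

    "requirements": {
      "auxFilesExist": [
        { "formatTag": "NcML", "formatVersion": "0.1" }
      ]
    }

A preview tool carrying this block is only shown for a data file once a matching NcML aux file has been stored for it.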
--- .../root/external-tools/auxFileTool.json | 26 ++++ .../source/api/external-tools.rst | 14 +- .../edu/harvard/iq/dataverse/DatasetPage.java | 2 +- .../edu/harvard/iq/dataverse/FilePage.java | 15 ++- .../edu/harvard/iq/dataverse/api/TestApi.java | 4 +- .../dataverse/externaltools/ExternalTool.java | 22 +++- .../ExternalToolServiceBean.java | 45 ++++++- .../V5.13.0.3__9153-extract-metadata.sql | 1 + .../iq/dataverse/api/ExternalToolsIT.java | 121 ++++++++++++++++++ .../ExternalToolServiceBeanTest.java | 68 +++++++++- 10 files changed, 306 insertions(+), 12 deletions(-) create mode 100644 doc/sphinx-guides/source/_static/installation/files/root/external-tools/auxFileTool.json create mode 100644 src/main/resources/db/migration/V5.13.0.3__9153-extract-metadata.sql diff --git a/doc/sphinx-guides/source/_static/installation/files/root/external-tools/auxFileTool.json b/doc/sphinx-guides/source/_static/installation/files/root/external-tools/auxFileTool.json new file mode 100644 index 00000000000..b188520dabb --- /dev/null +++ b/doc/sphinx-guides/source/_static/installation/files/root/external-tools/auxFileTool.json @@ -0,0 +1,26 @@ +{ + "displayName": "AuxFileViewer", + "description": "Show an auxiliary file from a dataset file.", + "toolName": "auxPreviewer", + "scope": "file", + "types": [ + "preview" + ], + "toolUrl": "https://example.com/AuxFileViewer.html", + "toolParameters": { + "queryParameters": [ + { + "fileid": "{fileId}" + } + ] + }, + "requirements": { + "auxFilesExist": [ + { + "formatTag": "myFormatTag", + "formatVersion": "0.1" + } + ] + }, + "contentType": "application/foobar" +} diff --git a/doc/sphinx-guides/source/api/external-tools.rst b/doc/sphinx-guides/source/api/external-tools.rst index 4f6c9a8015c..eec9944338f 100644 --- a/doc/sphinx-guides/source/api/external-tools.rst +++ b/doc/sphinx-guides/source/api/external-tools.rst @@ -53,15 +53,21 @@ External tools must be expressed in an external tool manifest file, a specific J Examples of Manifests +++++++++++++++++++++ -Let's look at two examples of external tool manifests (one at the file level and one at the dataset level) before we dive into how they work. +Let's look at a few examples of external tool manifests (both at the file level and at the dataset level) before we dive into how they work. + +.. _tools-for-files: External Tools for Files ^^^^^^^^^^^^^^^^^^^^^^^^ -:download:`fabulousFileTool.json <../_static/installation/files/root/external-tools/fabulousFileTool.json>` is a file level both an "explore" tool and a "preview" tool that operates on tabular files: +:download:`fabulousFileTool.json <../_static/installation/files/root/external-tools/fabulousFileTool.json>` is a file level (both an "explore" tool and a "preview" tool) that operates on tabular files: .. literalinclude:: ../_static/installation/files/root/external-tools/fabulousFileTool.json +:download:`auxFileTool.json <../_static/installation/files/root/external-tools/auxFileTool.json>` is a file level preview tool that operates on auxiliary files associated with a data file (note the "requirements" section): + +.. literalinclude:: ../_static/installation/files/root/external-tools/auxFileTool.json + External Tools for Datasets ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -113,6 +119,10 @@ Terminology allowedApiCalls httpMethod Which HTTP method the specified callback uses such as ``GET`` or ``POST``. allowedApiCalls timeOut For non-public datasets and datafiles, how many minutes the signed URLs given to the tool should be valid for. Must be an integer. 
+ + requirements **Resources your tool needs to function.** For now, the only requirement you can specify is that one or more auxiliary files exist (see auxFilesExist in the :ref:`tools-for-files` example). Currently, requirements only apply to preview tools. If the requirements are not met, the preview tool is not shown. + + auxFilesExist **An array containing formatTag and formatVersion pairs** for each auxiliary file that your tool needs to download to function properly. For example, a required aux file could have a ``formatTag`` of "NcML" and a ``formatVersion`` of "1.0". See also :doc:`/developers/aux-file-support`. toolName A **name** of an external tool that is used to differentiate between external tools and also used in bundle.properties for localization in the Dataverse installation web interface. For example, the toolName for Data Explorer is ``explorer``. For the Data Curation Tool the toolName is ``dct``. This is an optional parameter in the manifest JSON file. =========================== ========== diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 6e71f6c5042..8bb1167afcd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -5490,7 +5490,7 @@ public List getCachedToolsForDataFile(Long fileId, ExternalTool.Ty return cachedTools; } DataFile dataFile = datafileService.find(fileId); - cachedTools = ExternalToolServiceBean.findExternalToolsByFile(externalTools, dataFile); + cachedTools = externalToolService.findExternalToolsByFile(externalTools, dataFile); cachedToolsByFileId.put(fileId, cachedTools); //add to map so we don't have to do the lifting again return cachedTools; } diff --git a/src/main/java/edu/harvard/iq/dataverse/FilePage.java b/src/main/java/edu/harvard/iq/dataverse/FilePage.java index 85eb79d2ddc..228db0a7584 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FilePage.java +++ b/src/main/java/edu/harvard/iq/dataverse/FilePage.java @@ -39,6 +39,7 @@ import edu.harvard.iq.dataverse.util.JsfHelper; import static edu.harvard.iq.dataverse.util.JsfHelper.JH; import edu.harvard.iq.dataverse.util.SystemConfig; +import edu.harvard.iq.dataverse.util.json.JsonUtil; import java.io.IOException; import java.time.format.DateTimeFormatter; import java.util.ArrayList; @@ -57,6 +58,9 @@ import javax.faces.view.ViewScoped; import javax.inject.Inject; import javax.inject.Named; +import javax.json.JsonArray; +import javax.json.JsonObject; +import javax.json.JsonValue; import javax.validation.ConstraintViolation; import org.primefaces.PrimeFaces; @@ -125,6 +129,8 @@ public class FilePage implements java.io.Serializable { ExternalToolServiceBean externalToolService; @EJB PrivateUrlServiceBean privateUrlService; + @EJB + AuxiliaryFileServiceBean auxiliaryFileService; @Inject DataverseRequestServiceBean dvRequestService; @@ -285,8 +291,15 @@ public void setDatasetVersionId(Long datasetVersionId) { this.datasetVersionId = datasetVersionId; } + // findPreviewTools would be a better name private List sortExternalTools(){ - List retList = externalToolService.findFileToolsByTypeAndContentType(ExternalTool.Type.PREVIEW, file.getContentType()); + List retList = new ArrayList<>(); + List previewTools = externalToolService.findFileToolsByTypeAndContentType(ExternalTool.Type.PREVIEW, file.getContentType()); + for (ExternalTool previewTool : previewTools) { + if (externalToolService.meetsRequirements(previewTool, file)) { + 
retList.add(previewTool); + } + } Collections.sort(retList, CompareExternalToolName); return retList; } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/TestApi.java b/src/main/java/edu/harvard/iq/dataverse/api/TestApi.java index b532fbd4154..42caa95b9f5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/TestApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/TestApi.java @@ -63,7 +63,9 @@ public Response getExternalToolsForFile(@PathParam("id") String idSupplied, @Que ApiToken apiToken = externalToolService.getApiToken(getRequestApiKey()); ExternalToolHandler externalToolHandler = new ExternalToolHandler(tool, dataFile, apiToken, dataFile.getFileMetadata(), null); JsonObjectBuilder toolToJson = externalToolService.getToolAsJsonWithQueryParameters(externalToolHandler); - tools.add(toolToJson); + if (externalToolService.meetsRequirements(tool, dataFile)) { + tools.add(toolToJson); + } } return ok(tools); } catch (WrappedResponse wr) { diff --git a/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalTool.java b/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalTool.java index 1789b7a90c3..0a238eb5198 100644 --- a/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalTool.java +++ b/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalTool.java @@ -39,6 +39,7 @@ public class ExternalTool implements Serializable { public static final String CONTENT_TYPE = "contentType"; public static final String TOOL_NAME = "toolName"; public static final String ALLOWED_API_CALLS = "allowedApiCalls"; + public static final String REQUIREMENTS = "requirements"; @Id @GeneratedValue(strategy = GenerationType.IDENTITY) @@ -103,6 +104,15 @@ public class ExternalTool implements Serializable { @Column(nullable = true, columnDefinition = "TEXT") private String allowedApiCalls; + /** + * When non-null, the tool has indicated that it has certain requirements + * that must be met before it should be shown to the user. This + * functionality was added for tools that operate on aux files rather than + * data files so "auxFilesExist" is one of the possible values. 
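+     * For example, a stored value might look like (matching the manifest parsing test below):
+     * {"auxFilesExist":[{"formatTag":"NcML","formatVersion":"0.1"}]}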
+ */ + @Column(nullable = true, columnDefinition = "TEXT") + private String requirements; + /** * This default constructor is only here to prevent this error at * deployment: @@ -118,10 +128,10 @@ public ExternalTool() { } public ExternalTool(String displayName, String toolName, String description, List externalToolTypes, Scope scope, String toolUrl, String toolParameters, String contentType) { - this(displayName, toolName, description, externalToolTypes, scope, toolUrl, toolParameters, contentType, null); + this(displayName, toolName, description, externalToolTypes, scope, toolUrl, toolParameters, contentType, null, null); } - public ExternalTool(String displayName, String toolName, String description, List externalToolTypes, Scope scope, String toolUrl, String toolParameters, String contentType, String allowedApiCalls) { + public ExternalTool(String displayName, String toolName, String description, List externalToolTypes, Scope scope, String toolUrl, String toolParameters, String contentType, String allowedApiCalls, String requirements) { this.displayName = displayName; this.toolName = toolName; this.description = description; @@ -131,6 +141,7 @@ public ExternalTool(String displayName, String toolName, String description, Lis this.toolParameters = toolParameters; this.contentType = contentType; this.allowedApiCalls = allowedApiCalls; + this.requirements = requirements; } public enum Type { @@ -326,5 +337,12 @@ public void setAllowedApiCalls(String allowedApiCalls) { this.allowedApiCalls = allowedApiCalls; } + public String getRequirements() { + return requirements; + } + + public void setRequirements(String requirements) { + this.requirements = requirements; + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBean.java index a65ad2427ba..f38cd7301ee 100644 --- a/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBean.java @@ -1,5 +1,7 @@ package edu.harvard.iq.dataverse.externaltools; +import edu.harvard.iq.dataverse.AuxiliaryFile; +import edu.harvard.iq.dataverse.AuxiliaryFileServiceBean; import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DataFileServiceBean; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -30,6 +32,8 @@ import static edu.harvard.iq.dataverse.externaltools.ExternalTool.*; import java.util.stream.Collectors; import java.util.stream.Stream; +import javax.ejb.EJB; +import javax.json.JsonValue; @Stateless @Named @@ -40,6 +44,9 @@ public class ExternalToolServiceBean { @PersistenceContext(unitName = "VDCNet-ejbPU") private EntityManager em; + @EJB + AuxiliaryFileServiceBean auxiliaryFileService; + public List findAll() { TypedQuery typedQuery = em.createQuery("SELECT OBJECT(o) FROM ExternalTool AS o ORDER BY o.id", ExternalTool.class); return typedQuery.getResultList(); @@ -133,13 +140,13 @@ public ExternalTool save(ExternalTool externalTool) { * file supports The list of tools is passed in so it doesn't hit the * database each time */ - public static List findExternalToolsByFile(List allExternalTools, DataFile file) { + public List findExternalToolsByFile(List allExternalTools, DataFile file) { List externalTools = new ArrayList<>(); //Map tabular data to it's mimetype (the isTabularData() check assures that this code works the same as before, but it may need to change if tabular data is split into subtypes 
with differing mimetypes) final String contentType = file.isTabularData() ? DataFileServiceBean.MIME_TYPE_TSV_ALT : file.getContentType(); allExternalTools.forEach((externalTool) -> { - //Match tool and file type - if (contentType.equals(externalTool.getContentType())) { + //Match tool and file type, then check requirements + if (contentType.equals(externalTool.getContentType()) && meetsRequirements(externalTool, file)) { externalTools.add(externalTool); } }); @@ -147,6 +154,31 @@ public static List findExternalToolsByFile(List allE return externalTools; } + public boolean meetsRequirements(ExternalTool externalTool, DataFile dataFile) { + String requirements = externalTool.getRequirements(); + if (requirements == null) { + logger.fine("Data file id" + dataFile.getId() + ": no requirements for tool id " + externalTool.getId()); + return true; + } + boolean meetsRequirements = true; + JsonObject requirementsObj = JsonUtil.getJsonObject(requirements); + JsonArray auxFilesExist = requirementsObj.getJsonArray("auxFilesExist"); + for (JsonValue jsonValue : auxFilesExist) { + String formatTag = jsonValue.asJsonObject().getString("formatTag"); + String formatVersion = jsonValue.asJsonObject().getString("formatVersion"); + AuxiliaryFile auxFile = auxiliaryFileService.lookupAuxiliaryFile(dataFile, formatTag, formatVersion); + if (auxFile == null) { + logger.fine("Data file id" + dataFile.getId() + ": cannot find required aux file. formatTag=" + formatTag + ". formatVersion=" + formatVersion); + meetsRequirements = false; + break; + } else { + logger.fine("Data file id" + dataFile.getId() + ": found required aux file. formatTag=" + formatTag + ". formatVersion=" + formatVersion); + meetsRequirements = true; + } + } + return meetsRequirements; + } + public static ExternalTool parseAddExternalToolManifest(String manifest) { if (manifest == null || manifest.isEmpty()) { @@ -170,6 +202,7 @@ public static ExternalTool parseAddExternalToolManifest(String manifest) { JsonObject toolParametersObj = jsonObject.getJsonObject(TOOL_PARAMETERS); JsonArray queryParams = toolParametersObj.getJsonArray("queryParameters"); JsonArray allowedApiCallsArray = jsonObject.getJsonArray(ALLOWED_API_CALLS); + JsonObject requirementsObj = jsonObject.getJsonObject(REQUIREMENTS); boolean allRequiredReservedWordsFound = false; if (scope.equals(Scope.FILE)) { @@ -227,8 +260,12 @@ public static ExternalTool parseAddExternalToolManifest(String manifest) { if(allowedApiCallsArray !=null) { allowedApiCalls = allowedApiCallsArray.toString(); } + String requirements = null; + if (requirementsObj != null) { + requirements = requirementsObj.toString(); + } - return new ExternalTool(displayName, toolName, description, externalToolTypes, scope, toolUrl, toolParameters, contentType, allowedApiCalls); + return new ExternalTool(displayName, toolName, description, externalToolTypes, scope, toolUrl, toolParameters, contentType, allowedApiCalls, requirements); } private static String getRequiredTopLevelField(JsonObject jsonObject, String key) { diff --git a/src/main/resources/db/migration/V5.13.0.3__9153-extract-metadata.sql b/src/main/resources/db/migration/V5.13.0.3__9153-extract-metadata.sql new file mode 100644 index 00000000000..48230d21032 --- /dev/null +++ b/src/main/resources/db/migration/V5.13.0.3__9153-extract-metadata.sql @@ -0,0 +1 @@ +ALTER TABLE externaltool ADD COLUMN IF NOT EXISTS requirements TEXT; diff --git a/src/test/java/edu/harvard/iq/dataverse/api/ExternalToolsIT.java 
b/src/test/java/edu/harvard/iq/dataverse/api/ExternalToolsIT.java index 5508a6c57dc..cdebeddb7bc 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/ExternalToolsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/ExternalToolsIT.java @@ -3,8 +3,11 @@ import com.jayway.restassured.RestAssured; import com.jayway.restassured.path.json.JsonPath; import com.jayway.restassured.response.Response; +import java.io.File; import java.io.IOException; import java.io.StringReader; +import java.nio.file.Path; +import java.nio.file.Paths; import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonObject; @@ -442,4 +445,122 @@ public void createToolSpreadsheetViewer() { .statusCode(OK.getStatusCode()); } + @Test + public void testFileLevelToolWithAuxFileReq() throws IOException { + + // Delete all external tools before testing. + Response getTools = UtilIT.getExternalTools(); + getTools.prettyPrint(); + getTools.then().assertThat() + .statusCode(OK.getStatusCode()); + String body = getTools.getBody().asString(); + JsonReader bodyObject = Json.createReader(new StringReader(body)); + JsonArray tools = bodyObject.readObject().getJsonArray("data"); + for (int i = 0; i < tools.size(); i++) { + JsonObject tool = tools.getJsonObject(i); + int id = tool.getInt("id"); + Response deleteExternalTool = UtilIT.deleteExternalTool(id); + deleteExternalTool.prettyPrint(); + } + + Response createUser = UtilIT.createRandomUser(); + createUser.prettyPrint(); + createUser.then().assertThat() + .statusCode(OK.getStatusCode()); + String username = UtilIT.getUsernameFromResponse(createUser); + String apiToken = UtilIT.getApiTokenFromResponse(createUser); + + Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken); + createDataverseResponse.prettyPrint(); + createDataverseResponse.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse); + + Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken); + createDataset.prettyPrint(); + createDataset.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset); + + // Not really an HDF5 file. Just random bytes. But the file extension makes it detected as HDF5. 
+ Path pathToFalseHdf5 = Paths.get(java.nio.file.Files.createTempDirectory(null) + File.separator + "false.hdf5"); + byte[] bytes = {1, 2, 3, 4, 5}; + java.nio.file.Files.write(pathToFalseHdf5, bytes); + + Response uploadFalseHdf5 = UtilIT.uploadFileViaNative(datasetId.toString(), pathToFalseHdf5.toString(), apiToken); + uploadFalseHdf5.prettyPrint(); + uploadFalseHdf5.then().assertThat() + .statusCode(OK.getStatusCode()); + + Integer falseHdf5 = JsonPath.from(uploadFalseHdf5.getBody().asString()).getInt("data.files[0].dataFile.id"); + + String pathToTrueHdf5 = "src/test/resources/hdf/hdf5/vlen_string_dset"; + Response uploadTrueHdf5 = UtilIT.uploadFileViaNative(datasetId.toString(), pathToTrueHdf5, apiToken); + uploadTrueHdf5.prettyPrint(); + uploadTrueHdf5.then().assertThat() + .statusCode(OK.getStatusCode()); + + Integer trueHdf5 = JsonPath.from(uploadTrueHdf5.getBody().asString()).getInt("data.files[0].dataFile.id"); + + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add("displayName", "HDF5 Tool"); + job.add("description", "Operates on HDF5 files"); + job.add("types", Json.createArrayBuilder().add("preview")); + job.add("scope", "file"); + job.add("contentType", "application/x-hdf5"); + job.add("toolUrl", "/dataexplore/dataverse-previewers/previewers/v1.3/TextPreview.html"); + job.add("toolParameters", Json.createObjectBuilder() + .add("queryParameters", Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("fileid", "{fileId}") + .build()) + .add(Json.createObjectBuilder() + .add("siteUrl", "{siteUrl}") + .build()) + .add(Json.createObjectBuilder() + .add("key", "{apiToken}") + .build()) + .build()) + .build()); + job.add("requirements", Json.createObjectBuilder() + .add("auxFilesExist", Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("formatTag", "NcML") + .add("formatVersion", "0.1") + ) + ) + ); + Response addExternalTool = UtilIT.addExternalTool(job.build()); + addExternalTool.prettyPrint(); + addExternalTool.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.displayName", CoreMatchers.equalTo("HDF5 Tool")); + + long toolId = JsonPath.from(addExternalTool.getBody().asString()).getLong("data.id"); + + Response getTool = UtilIT.getExternalTool(toolId); + getTool.prettyPrint(); + getTool.then().assertThat() + .body("data.scope", CoreMatchers.equalTo("file")) + .statusCode(OK.getStatusCode()); + + // No tools for false HDF5 file. Aux file couldn't be extracted. Doesn't meet requirements. + Response getToolsForFalseHdf5 = UtilIT.getExternalToolsForFile(falseHdf5.toString(), "preview", apiToken); + getToolsForFalseHdf5.prettyPrint(); + getToolsForFalseHdf5.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data", Matchers.hasSize(0)); + + // The tool shows for a true HDF5 file. The NcML aux file is available. Requirements met. 
+ Response getToolsForTrueHdf5 = UtilIT.getExternalToolsForFile(trueHdf5.toString(), "preview", apiToken); + getToolsForTrueHdf5.prettyPrint(); + getToolsForTrueHdf5.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data[0].displayName", CoreMatchers.equalTo("HDF5 Tool")) + .body("data[0].scope", CoreMatchers.equalTo("file")) + .body("data[0].contentType", CoreMatchers.equalTo("application/x-hdf5")); + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java index 74e10d67352..631c22d959b 100644 --- a/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java @@ -19,7 +19,10 @@ public class ExternalToolServiceBeanTest { + private final ExternalToolServiceBean externalToolService; + public ExternalToolServiceBeanTest() { + this.externalToolService = new ExternalToolServiceBean(); } @Test @@ -49,7 +52,7 @@ public void testfindAll() { ExternalToolHandler externalToolHandler4 = new ExternalToolHandler(externalTool, dataFile, apiToken, fmd, null); List externalTools = new ArrayList<>(); externalTools.add(externalTool); - List availableExternalTools = ExternalToolServiceBean.findExternalToolsByFile(externalTools, dataFile); + List availableExternalTools = externalToolService.findExternalToolsByFile(externalTools, dataFile); assertEquals(availableExternalTools.size(), 1); } @@ -544,4 +547,67 @@ protected static ExternalTool getAllowedApiCallsTool() { return ExternalToolServiceBean.parseAddExternalToolManifest(tool); } + + @Test + public void testParseAddFileToolRequireAuxFile() { + JsonObjectBuilder job = Json.createObjectBuilder(); + job.add("displayName", "AwesomeTool"); + job.add("toolName", "explorer"); + job.add("description", "This tool is awesome."); + job.add("types", Json.createArrayBuilder().add("explore")); + job.add("scope", "file"); + job.add("hasPreviewMode", "false"); + job.add("toolUrl", "http://awesometool.com"); + job.add("toolParameters", Json.createObjectBuilder() + .add("queryParameters", Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("filePid", "{filePid}") + .build()) + .add(Json.createObjectBuilder() + .add("key", "{apiToken}") + .build()) + .add(Json.createObjectBuilder() + .add("fileMetadataId", "{fileMetadataId}") + .build()) + .add(Json.createObjectBuilder() + .add("dvLocale", "{localeCode}") + .build()) + .build()) + .build()); + job.add("requirements", Json.createObjectBuilder() + .add("auxFilesExist", Json.createArrayBuilder() + .add(Json.createObjectBuilder() + .add("formatTag", "NcML") + .add("formatVersion", "0.1") + ) + ) + ); + job.add(ExternalTool.CONTENT_TYPE, DataFileServiceBean.MIME_TYPE_TSV_ALT); + String tool = job.build().toString(); + ExternalTool externalTool = ExternalToolServiceBean.parseAddExternalToolManifest(tool); + assertEquals("AwesomeTool", externalTool.getDisplayName()); + assertEquals("explorer", externalTool.getToolName()); + assertEquals("{\"auxFilesExist\":[{\"formatTag\":\"NcML\",\"formatVersion\":\"0.1\"}]}", externalTool.getRequirements()); + /* + DataFile dataFile = new DataFile(); + dataFile.setId(42l); + dataFile.setGlobalId(new GlobalId("doi:10.5072/FK2/RMQT6J/G9F1A1")); + FileMetadata fmd = new FileMetadata(); + fmd.setId(2L); + DatasetVersion dv = new DatasetVersion(); + Dataset ds = new Dataset(); + dv.setDataset(ds); + fmd.setDatasetVersion(dv); 
+ List fmdl = new ArrayList(); + fmdl.add(fmd); + dataFile.setFileMetadatas(fmdl); + ApiToken apiToken = new ApiToken(); + apiToken.setTokenString("7196b5ce-f200-4286-8809-03ffdbc255d7"); + ExternalToolHandler externalToolHandler = new ExternalToolHandler(externalTool, dataFile, apiToken, fmd, "fr"); + String toolUrl = externalToolHandler.getToolUrlWithQueryParams(); + System.out.println("result: " + toolUrl); + assertEquals("http://awesometool.com?filePid=doi:10.5072/FK2/RMQT6J/G9F1A1&key=7196b5ce-f200-4286-8809-03ffdbc255d7&fileMetadataId=2&dvLocale=fr", toolUrl); +*/ + } + } From d2e14f06bb57da8a9ec8383b178c7bca8f94d148 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 20 Dec 2022 10:18:49 -0500 Subject: [PATCH 5/9] remove cruft #9153 --- .../ExternalToolServiceBeanTest.java | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java index 631c22d959b..3885c9b358c 100644 --- a/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/externaltools/ExternalToolServiceBeanTest.java @@ -588,26 +588,6 @@ public void testParseAddFileToolRequireAuxFile() { assertEquals("AwesomeTool", externalTool.getDisplayName()); assertEquals("explorer", externalTool.getToolName()); assertEquals("{\"auxFilesExist\":[{\"formatTag\":\"NcML\",\"formatVersion\":\"0.1\"}]}", externalTool.getRequirements()); - /* - DataFile dataFile = new DataFile(); - dataFile.setId(42l); - dataFile.setGlobalId(new GlobalId("doi:10.5072/FK2/RMQT6J/G9F1A1")); - FileMetadata fmd = new FileMetadata(); - fmd.setId(2L); - DatasetVersion dv = new DatasetVersion(); - Dataset ds = new Dataset(); - dv.setDataset(ds); - fmd.setDatasetVersion(dv); - List fmdl = new ArrayList(); - fmdl.add(fmd); - dataFile.setFileMetadatas(fmdl); - ApiToken apiToken = new ApiToken(); - apiToken.setTokenString("7196b5ce-f200-4286-8809-03ffdbc255d7"); - ExternalToolHandler externalToolHandler = new ExternalToolHandler(externalTool, dataFile, apiToken, fmd, "fr"); - String toolUrl = externalToolHandler.getToolUrlWithQueryParams(); - System.out.println("result: " + toolUrl); - assertEquals("http://awesometool.com?filePid=doi:10.5072/FK2/RMQT6J/G9F1A1&key=7196b5ce-f200-4286-8809-03ffdbc255d7&fileMetadataId=2&dvLocale=fr", toolUrl); -*/ } } From 24a0b3e4b437c7896e35ad9a94683d578809d32d Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 22 Dec 2022 14:11:06 -0500 Subject: [PATCH 6/9] add NcML previewer to guides (merged upstream) #9153 Merged: https://github.com/gdcc/dataverse-previewers/pull/18 --- .../source/_static/admin/dataverse-external-tools.tsv | 2 +- doc/sphinx-guides/source/user/dataset-management.rst | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/sphinx-guides/source/_static/admin/dataverse-external-tools.tsv b/doc/sphinx-guides/source/_static/admin/dataverse-external-tools.tsv index fd1f0f27bc5..16623a6aeec 100644 --- a/doc/sphinx-guides/source/_static/admin/dataverse-external-tools.tsv +++ b/doc/sphinx-guides/source/_static/admin/dataverse-external-tools.tsv @@ -1,5 +1,5 @@ Tool Type Scope Description Data Explorer explore file A GUI which lists the variables in a tabular data file allowing searching, charting and cross tabulation analysis. 
See the README.md file at https://github.com/scholarsportal/dataverse-data-explorer-v2 for the instructions on adding Data Explorer to your Dataverse. Whole Tale explore dataset A platform for the creation of reproducible research packages that allows users to launch containerized interactive analysis environments based on popular tools such as Jupyter and RStudio. Using this integration, Dataverse users can launch Jupyter and RStudio environments to analyze published datasets. For more information, see the `Whole Tale User Guide `_. -File Previewers explore file A set of tools that display the content of files - including audio, html, `Hypothes.is `_ annotations, images, PDF, text, video, tabular data, spreadsheets, GeoJSON, and ZipFiles - allowing them to be viewed without downloading the file. The previewers can be run directly from github.io, so the only required step is using the Dataverse API to register the ones you want to use. Documentation, including how to optionally brand the previewers, and an invitation to contribute through github are in the README.md file. Initial development was led by the Qualitative Data Repository and the spreasdheet previewer was added by the Social Sciences and Humanities Open Cloud (SSHOC) project. https://github.com/gdcc/dataverse-previewers +File Previewers explore file A set of tools that display the content of files - including audio, html, `Hypothes.is `_ annotations, images, PDF, text, video, tabular data, spreadsheets, GeoJSON, zip, and NcML files - allowing them to be viewed without downloading the file. The previewers can be run directly from github.io, so the only required step is using the Dataverse API to register the ones you want to use. Documentation, including how to optionally brand the previewers, and an invitation to contribute through github are in the README.md file. Initial development was led by the Qualitative Data Repository and the spreasdheet previewer was added by the Social Sciences and Humanities Open Cloud (SSHOC) project. https://github.com/gdcc/dataverse-previewers Data Curation Tool configure file A GUI for curating data by adding labels, groups, weights and other details to assist with informed reuse. See the README.md file at https://github.com/scholarsportal/Dataverse-Data-Curation-Tool for the installation instructions. diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index e891ca72880..0c9c7c9e3c7 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -177,11 +177,15 @@ File Handling Certain file types in the Dataverse installation are supported by additional functionality, which can include downloading in different formats, previews, file-level metadata preservation, file-level data citation; and exploration through data visualization and analysis. See the sections below for information about special functionality for specific file types. +.. _file-previews: + File Previews ------------- Dataverse installations can add previewers for common file types uploaded by their research communities. The previews appear on the file page. If a preview tool for a specific file type is available, the preview will be created and will display automatically, after terms have been agreed to or a guestbook entry has been made, if necessary. File previews are not available for restricted files unless they are being accessed using a Private URL. See also :ref:`privateurl`. 
+Installation of previewers is explained in the :doc:`/admin/external-tools` section of the Admin Guide.
+
 Tabular Data Files
 ------------------
 
@@ -302,7 +306,7 @@ Metadata found in the header section of `Flexible Image Transport System (FITS)
 NetCDF and HDF5
 ---------------
 
-For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML_ (XML) format and save it as an auxiliary file. (See also :doc:`/developers/aux-file-support` in the Developer Guide.)
+For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML_ (XML) format and save it as an auxiliary file. (See also :doc:`/developers/aux-file-support` in the Developer Guide.) A previewer for these NcML files is available (see :ref:`file-previews`).
 
 .. _NcML: https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_overview.html
 
From e2066c854c534193b9fa9651a6a02bae82857e07 Mon Sep 17 00:00:00 2001
From: Philip Durbin 
Date: Thu, 5 Jan 2023 16:09:55 -0500
Subject: [PATCH 7/9] add extractNcml API endpoint #9153

---
 doc/release-notes/9153-extract-metadata.md |   2 +
 doc/sphinx-guides/source/api/native-api.rst |  41 ++++++
 .../source/user/dataset-management.rst |   2 +
 .../edu/harvard/iq/dataverse/api/Files.java |  21 +++
 .../dataverse/ingest/IngestServiceBean.java | 139 +++++++++++++-----
 .../harvard/iq/dataverse/api/NetcdfIT.java | 125 ++++++++++++++++
 .../edu/harvard/iq/dataverse/api/UtilIT.java |  18 ++-
 7 files changed, 304 insertions(+), 44 deletions(-)

diff --git a/doc/release-notes/9153-extract-metadata.md b/doc/release-notes/9153-extract-metadata.md
index ce4cc714805..be21c5ed739 100644
--- a/doc/release-notes/9153-extract-metadata.md
+++ b/doc/release-notes/9153-extract-metadata.md
@@ -1 +1,3 @@
 For NetCDF and HDF5 files, an attempt will be made to extract metadata in NcML (XML) format and save it as an auxiliary file.
+
+An "extractNcml" API endpoint has been added, especially for installations with existing NetCDF and HDF5 files. After upgrading, they can iterate through these files and try to extract an NcML file.
diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst
index 76ca38fdc70..40011a7d175 100644
--- a/doc/sphinx-guides/source/api/native-api.rst
+++ b/doc/sphinx-guides/source/api/native-api.rst
@@ -2248,6 +2248,47 @@ Currently the following methods are used to detect file types:
 - The file extension (e.g. ".ipybn") is used, defined in a file called ``MimeTypeDetectionByFileExtension.properties``.
 - The file name (e.g. "Dockerfile") is used, defined in a file called ``MimeTypeDetectionByFileName.properties``.
 
+.. _extractNcml:
+
+Extract NcML
+~~~~~~~~~~~~
+
+As explained in the :ref:`netcdf-and-hdf5` section of the User Guide, when those file types are uploaded, an attempt is made to extract an NcML file from them and store it as an auxiliary file.
+
+This happens automatically but superusers can also manually trigger this NcML extraction process with the API endpoint below.
+
+Note that "true" will be returned if an NcML file was created. "false" will be returned if there was an error or if the NcML file already exists (check server.log for details).
+
+.. code-block:: bash
+
+  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  export SERVER_URL=https://demo.dataverse.org
+  export ID=24
+
+  curl -H "X-Dataverse-key:$API_TOKEN" -X POST "$SERVER_URL/api/files/$ID/extractNcml"
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/files/24/extractNcml"
+
+A curl example using a PID:
+
+.. code-block:: bash
+
+  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  export SERVER_URL=https://demo.dataverse.org
+  export PERSISTENT_ID=doi:10.5072/FK2/AAA000
+
+  curl -H "X-Dataverse-key:$API_TOKEN" -X POST "$SERVER_URL/api/files/:persistentId/extractNcml?persistentId=$PERSISTENT_ID"
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/files/:persistentId/extractNcml?persistentId=doi:10.5072/FK2/AAA000"
+
 Replacing Files
 ~~~~~~~~~~~~~~~
 
diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst
index 0c9c7c9e3c7..1da31707749 100755
--- a/doc/sphinx-guides/source/user/dataset-management.rst
+++ b/doc/sphinx-guides/source/user/dataset-management.rst
@@ -303,6 +303,8 @@ Astronomy (FITS)
 
 Metadata found in the header section of `Flexible Image Transport System (FITS) files `_ are automatically extracted by the Dataverse Software, aggregated and displayed in the Astronomy Domain-Specific Metadata of the Dataset that the file belongs to. This FITS file metadata, is therefore searchable and browsable (facets) at the Dataset-level.
 
+.. _netcdf-and-hdf5:
+
 NetCDF and HDF5
 ---------------
 
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Files.java b/src/main/java/edu/harvard/iq/dataverse/api/Files.java
index af0f6be6d32..6cdbcf82c1b 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/Files.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/Files.java
@@ -625,6 +625,27 @@ public Response redetectDatafile(@PathParam("id") String id, @QueryParam("dryRun
 }
 }
 
+ @Path("{id}/extractNcml")
+ @POST
+ public Response extractNcml(@PathParam("id") String id) {
+ try {
+ AuthenticatedUser au = findAuthenticatedUserOrDie();
+ if (!au.isSuperuser()) {
+ // We can always make a command in the future if there's a need
+ // for non-superusers to call this API.
+ return error(Response.Status.FORBIDDEN, "This API call can be used by superusers only");
+ }
+ DataFile dataFileIn = findDataFileOrDie(id);
+ java.nio.file.Path tempLocationPath = null;
+ boolean successOrFail = ingestService.extractMetadataNcml(dataFileIn, tempLocationPath);
+ NullSafeJsonBuilder result = NullSafeJsonBuilder.jsonObjectBuilder()
+ .add("result", successOrFail);
+ return ok(result);
+ } catch (WrappedResponse wr) {
+ return wr.getResponse();
+ }
+ }
+
 /**
 * Attempting to run metadata export, for all the formats for which we have
 * metadata Exporters. 
diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index b5934c1167f..f3fc56a54aa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -242,43 +242,7 @@ public List saveAndAddFilesToDataset(DatasetVersion version, logger.fine("Success: permanently saved file " + dataFile.getFileMetadata().getLabel()); // TODO: reformat this file to remove the many tabs added in cc08330 - InputStream inputStream = null; - if (tempLocationPath != null) { - try ( NetcdfFile netcdfFile = NetcdfFiles.open(tempLocationPath.toString())) { - if (netcdfFile != null) { - // For now, empty string. What should we pass as a URL to toNcml()? The filename (including the path) most commonly at https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_cookbook.html - // With an empty string the XML will show 'location="file:"'. - String ncml = netcdfFile.toNcml(""); - inputStream = new ByteArrayInputStream(ncml.getBytes(StandardCharsets.UTF_8)); - } else { - logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + " (null returned)."); - } - } catch (IOException ex) { - logger.info("NetcdfFiles.open() could open file id " + dataFile.getId() + ". Exception caught: " + ex); - } - } else { - logger.info("tempLocationPath is null for file id " + dataFile.getId() + ". Can't extract NcML."); - } - if (inputStream != null) { - // If you change NcML, you must also change the previewer. - String formatTag = "NcML"; - // 0.1 is arbitrary. It's our first attempt to put out NcML so we're giving it a low number. - // If you bump the number here, be sure the bump the number in the previewer as well. - // We could use 2.2 here since that's the current version of NcML. - String formatVersion = "0.1"; - String origin = "netcdf-java"; - boolean isPublic = true; - // See also file.auxfiles.types.NcML in Bundle.properties. Used to group aux files in UI. - String type = "NcML"; - // XML because NcML doesn't have its own MIME/content type at https://www.iana.org/assignments/media-types/media-types.xhtml - MediaType mediaType = new MediaType("text", "xml"); - try { - AuxiliaryFile auxFile = auxiliaryFileService.processAuxiliaryFile(inputStream, dataFile, formatTag, formatVersion, origin, isPublic, type, mediaType, false); - logger.fine ("Aux file extracted from NetCDF/HDF5 file saved to storage (but not to the database yet) from file id " + dataFile.getId()); - } catch (Exception ex) { - logger.info("exception throw calling processAuxiliaryFile: " + ex); - } - } + extractMetadataNcml(dataFile, tempLocationPath); } catch (IOException ioex) { logger.warning("Failed to save the file, storage id " + dataFile.getStorageIdentifier() + " (" + ioex.getMessage() + ")"); @@ -392,7 +356,7 @@ public List saveAndAddFilesToDataset(DatasetVersion version, try { // FITS is the only type supported for metadata // extraction, as of now. -- L.A. 4.0 - // Consider adding other formats such as NetCDF/HDF5. + // Note that extractMetadataNcml() is used for NetCDF/HDF5. 
dataFile.setContentType("application/fits");
 metadataExtracted = extractMetadata(tempFileLocation, dataFile, version);
 } catch (IOException mex) {
@@ -1255,7 +1219,104 @@ public boolean extractMetadata(String tempFileLocation, DataFile dataFile, Datas
 return ingestSuccessful;
 }
- 
+ /**
+ * @param dataFile The DataFile from which to attempt NcML extraction
+ * (NetCDF or HDF5 format)
+ * @param tempLocationPath Null if the file is already saved to permanent
+ * storage. Otherwise, the path to the temp location of the file, as during
+ * initial upload.
+ * @return True if the NcML file was created. False on any error or if the
+ * NcML file already exists.
+ */
+ public boolean extractMetadataNcml(DataFile dataFile, Path tempLocationPath) {
+ boolean ncmlFileCreated = false;
+ logger.fine("extractMetadataNcml: dataFileIn: " + dataFile + ". tempLocationPath: " + tempLocationPath);
+ InputStream inputStream = null;
+ String dataFileLocation = null;
+ if (tempLocationPath != null) {
+ // This file was just uploaded and hasn't been saved to S3 or local storage.
+ dataFileLocation = tempLocationPath.toString();
+ } else {
+ // This file is already on S3 or local storage.
+ File tempFile = null;
+ File localFile;
+ StorageIO storageIO;
+ try {
+ storageIO = dataFile.getStorageIO();
+ storageIO.open();
+ if (storageIO.isLocalFile()) {
+ localFile = storageIO.getFileSystemPath().toFile();
+ dataFileLocation = localFile.getAbsolutePath();
+ logger.fine("extractMetadataNcml: file is local. Path: " + dataFileLocation);
+ } else {
+ // Need to create a temporary local file:
+ tempFile = File.createTempFile("tempFileExtractMetadataNcml", ".tmp");
+ try ( ReadableByteChannel targetFileChannel = (ReadableByteChannel) storageIO.getReadChannel(); FileChannel tempFileChannel = new FileOutputStream(tempFile).getChannel();) {
+ tempFileChannel.transferFrom(targetFileChannel, 0, storageIO.getSize());
+ }
+ dataFileLocation = tempFile.getAbsolutePath();
+ logger.fine("extractMetadataNcml: file is on S3. Downloaded and saved to temp path: " + dataFileLocation);
+ }
+ } catch (IOException ex) {
+ logger.info("While attempting to extract NcML, could not use storageIO for data file id " + dataFile.getId() + ". Exception: " + ex);
+ }
+ }
+ if (dataFileLocation != null) {
+ try ( NetcdfFile netcdfFile = NetcdfFiles.open(dataFileLocation)) {
+ logger.fine("trying to open " + dataFileLocation);
+ if (netcdfFile != null) {
+ // For now, empty string. What should we pass as a URL to toNcml()? The filename (including the path) most commonly at https://docs.unidata.ucar.edu/netcdf-java/current/userguide/ncml_cookbook.html
+ // With an empty string the XML will show 'location="file:"'.
+ String ncml = netcdfFile.toNcml("");
+ inputStream = new ByteArrayInputStream(ncml.getBytes(StandardCharsets.UTF_8));
+ } else {
+ logger.info("NetcdfFiles.open() could not open file id " + dataFile.getId() + " (null returned).");
+ }
+ } catch (IOException ex) {
+ logger.info("NetcdfFiles.open() could not open file id " + dataFile.getId() + ". Exception caught: " + ex);
+ }
+ } else {
+ logger.info("dataFileLocation is null for file id " + dataFile.getId() + ". Can't extract NcML.");
+ }
+ if (inputStream != null) {
+ // If you change NcML, you must also change the previewer.
+ String formatTag = "NcML";
+ // 0.1 is arbitrary. It's our first attempt to put out NcML so we're giving it a low number.
+ // If you bump the number here, be sure to bump the number in the previewer as well. 
+ // We could use 2.2 here since that's the current version of NcML. + String formatVersion = "0.1"; + String origin = "netcdf-java"; + boolean isPublic = true; + // See also file.auxfiles.types.NcML in Bundle.properties. Used to group aux files in UI. + String type = "NcML"; + // XML because NcML doesn't have its own MIME/content type at https://www.iana.org/assignments/media-types/media-types.xhtml + MediaType mediaType = new MediaType("text", "xml"); + try { + // Let the cascade do the save if the file isn't yet on permanent storage. + boolean callSave = false; + if (tempLocationPath == null) { + callSave = true; + // Check for an existing NcML file + logger.fine("Checking for existing NcML aux file for file id " + dataFile.getId()); + AuxiliaryFile existingAuxiliaryFile = auxiliaryFileService.lookupAuxiliaryFile(dataFile, formatTag, formatVersion); + if (existingAuxiliaryFile != null) { + logger.fine("Aux file already exists for NetCDF/HDF5 file for file id " + dataFile.getId()); + return false; + } + } + AuxiliaryFile auxFile = auxiliaryFileService.processAuxiliaryFile(inputStream, dataFile, formatTag, formatVersion, origin, isPublic, type, mediaType, callSave); + logger.fine("Aux file extracted from NetCDF/HDF5 file saved to storage (but not to the database yet) from file id " + dataFile.getId()); + ncmlFileCreated = true; + } catch (Exception ex) { + logger.info("exception throw calling processAuxiliaryFile: " + ex); + } + } else { + logger.info("extractMetadataNcml: input stream is null! dataFileLocation was " + dataFileLocation); + } + + return ncmlFileCreated; + } + private void processDatasetMetadata(FileMetadataIngest fileMetadataIngest, DatasetVersion editVersion) throws IOException { diff --git a/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java b/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java index 74179b98833..9716e7aca13 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/NetcdfIT.java @@ -3,9 +3,16 @@ import com.jayway.restassured.RestAssured; import com.jayway.restassured.path.json.JsonPath; import com.jayway.restassured.response.Response; +import java.io.File; import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; import static javax.ws.rs.core.Response.Status.CREATED; +import static javax.ws.rs.core.Response.Status.FORBIDDEN; +import static javax.ws.rs.core.Response.Status.NOT_FOUND; import static javax.ws.rs.core.Response.Status.OK; +import org.hamcrest.CoreMatchers; +import static org.hamcrest.CoreMatchers.equalTo; import org.junit.BeforeClass; import org.junit.Test; @@ -53,5 +60,123 @@ public void testNmclFromNetcdf() throws IOException { downloadNcml.then().assertThat() .statusCode(OK.getStatusCode()) .contentType("text/xml; name=\"madis-raob.ncml_0.1.xml\";charset=UTF-8"); + + Response deleteNcml = UtilIT.deleteAuxFile(fileId, tag, version, apiToken); + deleteNcml.prettyPrint(); + deleteNcml.then().assertThat().statusCode(OK.getStatusCode()); + + Response downloadNcmlShouldFail = UtilIT.downloadAuxFile(fileId, tag, version, apiToken); + downloadNcmlShouldFail.then().assertThat() + .statusCode(NOT_FOUND.getStatusCode()); + + UtilIT.makeSuperUser(username).then().assertThat().statusCode(OK.getStatusCode()); + + Response extractNcml = UtilIT.extractNcml(fileId, apiToken); + extractNcml.prettyPrint(); + extractNcml.then().assertThat() + .statusCode(OK.getStatusCode()); + + Response downloadNcmlShouldWork = UtilIT.downloadAuxFile(fileId, tag, version, 
apiToken); + downloadNcmlShouldWork.then().assertThat() + .statusCode(OK.getStatusCode()); + } + + @Test + public void testNmclFromNetcdfErrorChecking() throws IOException { + Response createUser = UtilIT.createRandomUser(); + createUser.then().assertThat().statusCode(OK.getStatusCode()); + String apiToken = UtilIT.getApiTokenFromResponse(createUser); + String username = UtilIT.getUsernameFromResponse(createUser); + + Response createUserRandom = UtilIT.createRandomUser(); + createUserRandom.then().assertThat().statusCode(OK.getStatusCode()); + String apiTokenRandom = UtilIT.getApiTokenFromResponse(createUserRandom); + + String apiTokenNull = null; + + Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken); + createDataverseResponse.prettyPrint(); + createDataverseResponse.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse); + + Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken); + createDataset.prettyPrint(); + createDataset.then().assertThat() + .statusCode(CREATED.getStatusCode()); + + Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset); + String datasetPid = UtilIT.getDatasetPersistentIdFromResponse(createDataset); + + String pathToFile = "src/test/resources/netcdf/madis-raob"; + + Response uploadFile = UtilIT.uploadFileViaNative(datasetId.toString(), pathToFile, apiToken); + uploadFile.prettyPrint(); + uploadFile.then().assertThat().statusCode(OK.getStatusCode()); + + long fileId = JsonPath.from(uploadFile.body().asString()).getLong("data.files[0].dataFile.id"); + String tag = "NcML"; + String version = "0.1"; + + Response downloadNcmlFail = UtilIT.downloadAuxFile(fileId, tag, version, apiTokenNull); + downloadNcmlFail.then().assertThat() + .statusCode(FORBIDDEN.getStatusCode()); + + Response downloadNcml = UtilIT.downloadAuxFile(fileId, tag, version, apiToken); + downloadNcml.then().assertThat() + .statusCode(OK.getStatusCode()) + .contentType("text/xml; name=\"madis-raob.ncml_0.1.xml\";charset=UTF-8"); + + Response deleteNcml = UtilIT.deleteAuxFile(fileId, tag, version, apiToken); + deleteNcml.prettyPrint(); + deleteNcml.then().assertThat().statusCode(OK.getStatusCode()); + + Response downloadNcmlShouldFail = UtilIT.downloadAuxFile(fileId, tag, version, apiToken); + downloadNcmlShouldFail.then().assertThat() + .statusCode(NOT_FOUND.getStatusCode()); + + Response extractNcmlFailRandomUser = UtilIT.extractNcml(fileId, apiTokenRandom); + extractNcmlFailRandomUser.prettyPrint(); + extractNcmlFailRandomUser.then().assertThat() + .statusCode(FORBIDDEN.getStatusCode()); + + UtilIT.makeSuperUser(username).then().assertThat().statusCode(OK.getStatusCode()); + + Response extractNcml = UtilIT.extractNcml(fileId, apiToken); + extractNcml.prettyPrint(); + extractNcml.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.result", CoreMatchers.equalTo(true)); + + Response downloadNcmlShouldWork = UtilIT.downloadAuxFile(fileId, tag, version, apiToken); + downloadNcmlShouldWork.then().assertThat() + .statusCode(OK.getStatusCode()); + + Response extractNcmlFailExistsAlready = UtilIT.extractNcml(fileId, apiToken); + extractNcmlFailExistsAlready.prettyPrint(); + extractNcmlFailExistsAlready.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.result", CoreMatchers.equalTo(false)); + + Path pathToTxt = Paths.get(java.nio.file.Files.createTempDirectory(null) + File.separator + "file.txt"); + String contentOfTxt = 
"Just a text file. Don't expect NcML out!"; + java.nio.file.Files.write(pathToTxt, contentOfTxt.getBytes()); + + Response uploadFileTxt = UtilIT.uploadFileViaNative(datasetId.toString(), pathToTxt.toString(), apiToken); + uploadFileTxt.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.files[0].label", equalTo("file.txt")); + + long fileIdTxt = JsonPath.from(uploadFileTxt.body().asString()).getLong("data.files[0].dataFile.id"); + + Response extractNcmlFailText = UtilIT.extractNcml(fileIdTxt, apiToken); + extractNcmlFailText.prettyPrint(); + extractNcmlFailText.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.result", CoreMatchers.equalTo(false)); + + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java index 12ccaf2caff..36dce2978fa 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java @@ -743,10 +743,11 @@ static Response uploadAuxFile(Long fileId, String pathToFile, String formatTag, } static Response downloadAuxFile(Long fileId, String formatTag, String formatVersion, String apiToken) { - Response response = given() - .header(API_TOKEN_HTTP_HEADER, apiToken) - .get("/api/access/datafile/" + fileId + "/auxiliary/" + formatTag + "/" + formatVersion); - return response; + RequestSpecification requestSpecification = given(); + if (apiToken != null) { + requestSpecification.header(API_TOKEN_HTTP_HEADER, apiToken); + } + return requestSpecification.get("/api/access/datafile/" + fileId + "/auxiliary/" + formatTag + "/" + formatVersion); } static Response listAuxFilesByOrigin(Long fileId, String origin, String apiToken) { @@ -1170,7 +1171,14 @@ public static Response uningestFile(Long fileId, String apiToken) { .post("/api/files/" + fileId + "/uningest/?key=" + apiToken); return uningestFileResponse; } - + + public static Response extractNcml(Long fileId, String apiToken) { + Response response = given() + .header(API_TOKEN_HTTP_HEADER, apiToken) + .post("/api/files/" + fileId + "/extractNcml"); + return response; + } + //I don't understand why this blows up when I remove the key public static Response getDataFileMetadata(Long fileId, String apiToken) { Response fileResponse = given() From 8cead71fe610e434fbbecd5cccdfe0aae8ef2eac Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Tue, 10 Jan 2023 15:36:45 -0500 Subject: [PATCH 8/9] cleanup (remove extraneous println) #9153 --- .../java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index f3fc56a54aa..9c6acd964c1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -305,7 +305,6 @@ public List saveAndAddFilesToDataset(DatasetVersion version, // Any necessary post-processing: // performPostProcessingTasks(dataFile); } else { - System.out.println("driver is not tmp"); try { StorageIO dataAccess = DataAccess.getStorageIO(dataFile); //Populate metadata From 230298902fbb7296c9623a355e66e72302f83174 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 19 Jan 2023 10:00:33 -0500 Subject: [PATCH 9/9] rename sql scripts #9153 "Use a version like '4.11.0.1' in the example above where the previously released version was 4.11" -- dev guide That is, these scripts should 
have been 5.12.1.whatever since the last release was 5.12.1. Fixing. (They were 5.13.whatever.) --- ...-sorting_licenses.sql => V5.12.1.1__8671-sorting_licenses.sql} | 0 ...ls-for-tools.sql => V5.12.1.2__7715-signed-urls-for-tools.sql} | 0 ...imates.sql => V5.12.1.3__8840-improve-guestbook-estimates.sql} | 0 ...-extract-metadata.sql => V5.12.1.4__9153-extract-metadata.sql} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename src/main/resources/db/migration/{V5.13.0.1__8671-sorting_licenses.sql => V5.12.1.1__8671-sorting_licenses.sql} (100%) rename src/main/resources/db/migration/{V5.13.0.2__7715-signed-urls-for-tools.sql => V5.12.1.2__7715-signed-urls-for-tools.sql} (100%) rename src/main/resources/db/migration/{V5.13.0.3__8840-improve-guestbook-estimates.sql => V5.12.1.3__8840-improve-guestbook-estimates.sql} (100%) rename src/main/resources/db/migration/{V5.13.0.3__9153-extract-metadata.sql => V5.12.1.4__9153-extract-metadata.sql} (100%) diff --git a/src/main/resources/db/migration/V5.13.0.1__8671-sorting_licenses.sql b/src/main/resources/db/migration/V5.12.1.1__8671-sorting_licenses.sql similarity index 100% rename from src/main/resources/db/migration/V5.13.0.1__8671-sorting_licenses.sql rename to src/main/resources/db/migration/V5.12.1.1__8671-sorting_licenses.sql diff --git a/src/main/resources/db/migration/V5.13.0.2__7715-signed-urls-for-tools.sql b/src/main/resources/db/migration/V5.12.1.2__7715-signed-urls-for-tools.sql similarity index 100% rename from src/main/resources/db/migration/V5.13.0.2__7715-signed-urls-for-tools.sql rename to src/main/resources/db/migration/V5.12.1.2__7715-signed-urls-for-tools.sql diff --git a/src/main/resources/db/migration/V5.13.0.3__8840-improve-guestbook-estimates.sql b/src/main/resources/db/migration/V5.12.1.3__8840-improve-guestbook-estimates.sql similarity index 100% rename from src/main/resources/db/migration/V5.13.0.3__8840-improve-guestbook-estimates.sql rename to src/main/resources/db/migration/V5.12.1.3__8840-improve-guestbook-estimates.sql diff --git a/src/main/resources/db/migration/V5.13.0.3__9153-extract-metadata.sql b/src/main/resources/db/migration/V5.12.1.4__9153-extract-metadata.sql similarity index 100% rename from src/main/resources/db/migration/V5.13.0.3__9153-extract-metadata.sql rename to src/main/resources/db/migration/V5.12.1.4__9153-extract-metadata.sql
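
As a rough sketch of the workflow the release note in patch 7/9 describes (iterating through existing NetCDF and HDF5 files after upgrading), a superuser could drive the new extractNcml endpoint with a small shell loop like the one below. Only the endpoint path, the X-Dataverse-key header, and the true/false "result" value come from the documentation added in the patch; the ids.txt list of file database IDs is an assumed input prepared separately by the installation admin, not something the patch series provides.

  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  export SERVER_URL=https://demo.dataverse.org

  # One file database ID per line in ids.txt.
  # "result": true means an NcML auxiliary file was created;
  # false means extraction failed or the NcML aux file already exists (see server.log).
  while read -r ID; do
    curl -s -H "X-Dataverse-key:$API_TOKEN" -X POST "$SERVER_URL/api/files/$ID/extractNcml"
    echo
  done < ids.txt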