diff --git a/doc/sphinx-guides/source/admin/metadataexport.rst b/doc/sphinx-guides/source/admin/metadataexport.rst index c6ebef0ce15..8efb100f003 100644 --- a/doc/sphinx-guides/source/admin/metadataexport.rst +++ b/doc/sphinx-guides/source/admin/metadataexport.rst @@ -34,3 +34,13 @@ Export Failures --------------- An export batch job, whether started via the API, or by the application timer, will leave a detailed log in your configured logs directory. This is the same location where your main Glassfish server.log is found. The name of the log file is ``export_[timestamp].log`` - for example, *export_2016-08-23T03-35-23.log*. The log will contain the numbers of datasets processed successfully and those for which metadata export failed, with some information on the failures detected. Please attach this log file if you need to contact Dataverse support about metadata export problems. + +Downloading Metadata via GUI +---------------------------- + +The :doc:`/user/dataset-management` section of the User Guide explains how end users can download the metadata formats above from the Dataverse GUI. + +Downloading Metadata via API +---------------------------- + +The :doc:`/api/native-api` section of the API Guide explains how end users can download the metadata formats above via API. diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 5d4c6bf8db3..0af6106b42b 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -252,6 +252,16 @@ Export Metadata of a Dataset in Various Formats .. note:: Supported exporters (export formats) are ``ddi``, ``oai_ddi``, ``dcterms``, ``oai_dc``, ``schema.org`` , and ``dataverse_json``. +Schema.org JSON-LD +^^^^^^^^^^^^^^^^^^ + +Please note that the ``schema.org`` format has changed in backwards-incompatible ways after Dataverse 4.9.4: + +- "description" was a single string and now it is an array of strings. 
+- "citation" was an array of strings and now it is an array of objects. + +Both forms are valid according to Google's Structured Data Testing Tool at https://search.google.com/structured-data/testing-tool . (This tool will report "The property affiliation is not recognized by Google for an object of type Thing" and this known issue is being tracked at https://github.com/IQSS/dataverse/issues/5029 .) Schema.org JSON-LD is an evolving standard that permits a great deal of flexibility. For example, https://schema.org/docs/gs.html#schemaorg_expected indicates that even when objects are expected, it's ok to just use text. As with all metadata export formats, we will try to keep the Schema.org JSON-LD format Dataverse emits backward-compatible to make integrations more stable, despite the flexibility that's afforded by the standard. + List Files in a Dataset ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 3311c288a44..1981ad328cc 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -755,6 +755,19 @@ This JVM option is used to configure the path where all the language specific pr If this value is not set, by default, a Dataverse installation will read the English language property files from the Java Application. +dataverse.files.hide-schema-dot-org-download-urls ++++++++++++++++++++++++++++++++++++++++++++++++++ + +Please note that this setting is experimental. + +By default, download URLs to files will be included in Schema.org JSON-LD output. To prevent these URLs from being included in the output, set ``dataverse.files.hide-schema-dot-org-download-urls`` to true as in the example below. 
+ +``./asadmin create-jvm-options '-Ddataverse.files.hide-schema-dot-org-download-urls=true'`` + +Please note that there are other reasons why download URLs may not be included for certain files such as if a guestbook entry is required or if the file is restricted. + +For more on Schema.org JSON-LD, see the :doc:`/admin/metadataexport` section of the Admin Guide. + Database Settings ----------------- diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index d78ad26efb2..51c409cc657 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -20,7 +20,7 @@ A dataset contains three levels of metadata: For more details about what Citation and Domain Specific Metadata is supported please see our :ref:`user-appendix`. -Note that once a dataset has been published its metadata may be exported. A button on the dataset page's metadata tab will allow a user to export the metadata of the most recently published version of the dataset. Currently supported export formats are DDI, Dublin Core and JSON. +Note that once a dataset has been published its metadata may be exported. A button on the dataset page's metadata tab will allow a user to export the metadata of the most recently published version of the dataset. Currently supported export formats are DDI, Dublin Core, Schema.org JSON-LD, and Dataverse's native JSON format. 
Adding a New Dataset ==================== diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetAuthor.java b/src/main/java/edu/harvard/iq/dataverse/DatasetAuthor.java index b1f1e4b459c..ce8405a0164 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetAuthor.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetAuthor.java @@ -7,6 +7,7 @@ package edu.harvard.iq.dataverse; import java.util.Comparator; +import java.util.regex.Pattern; /** @@ -87,5 +88,60 @@ public boolean isEmpty() { && (name==null || name.getValue().trim().equals("")) ); } - + + /** + * https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier + */ + final public static String REGEX_ORCID = "^\\d{4}-\\d{4}-\\d{4}-(\\d{4}|\\d{3}X)$"; + final public static String REGEX_ISNI = "^\\d*$"; + final public static String REGEX_LCNA = "^[a-z]+\\d+$"; + final public static String REGEX_VIAF = "^\\d*$"; + /** + * GND regex from https://www.wikidata.org/wiki/Property:P227 + */ + final public static String REGEX_GND = "^1[01]?\\d{7}[0-9X]|[47]\\d{6}-\\d|[1-9]\\d{0,7}-[0-9X]|3\\d{7}[0-9X]$"; + + /** + * Each author identification type has its own valid pattern/syntax. 
+ */ + public static Pattern getValidPattern(String regex) { + return Pattern.compile(regex); + } + + public String getIdentifierAsUrl() { + if (idType != null && !idType.isEmpty() && idValue != null && !idValue.isEmpty()) { + DatasetFieldValueValidator datasetFieldValueValidator = new DatasetFieldValueValidator(); + switch (idType) { + case "ORCID": + if (datasetFieldValueValidator.isValidAuthorIdentifier(idValue, getValidPattern(REGEX_ORCID))) { + return "https://orcid.org/" + idValue; + } + break; + case "ISNI": + if (datasetFieldValueValidator.isValidAuthorIdentifier(idValue, getValidPattern(REGEX_ISNI))) { + return "http://www.isni.org/isni/" + idValue; + } + break; + case "LCNA": + if (datasetFieldValueValidator.isValidAuthorIdentifier(idValue, getValidPattern(REGEX_LCNA))) { + return "http://id.loc.gov/authorities/names/" + idValue; + } + break; + case "VIAF": + if (datasetFieldValueValidator.isValidAuthorIdentifier(idValue, getValidPattern(REGEX_VIAF))) { + return "https://viaf.org/viaf/" + idValue; + } + break; + case "GND": + if (datasetFieldValueValidator.isValidAuthorIdentifier(idValue, getValidPattern(REGEX_GND))) { + return "https://d-nb.info/gnd/" + idValue; + } + break; + default: + break; + } + } + return null; + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java index ac79058fd98..e571fd89627 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldValueValidator.java @@ -14,6 +14,8 @@ import java.util.Date; import java.util.GregorianCalendar; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.validation.ConstraintValidator; import javax.validation.ConstraintValidatorContext; import org.apache.commons.lang.StringUtils; @@ -216,4 +218,8 @@ private boolean isValidDate(String dateString, String 
pattern) { return valid; } + public boolean isValidAuthorIdentifier(String userInput, Pattern pattern) { + return pattern.matcher(userInput).matches(); + } + } diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetRelPublication.java b/src/main/java/edu/harvard/iq/dataverse/DatasetRelPublication.java index 6bf55445d57..7680ebc16db 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetRelPublication.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetRelPublication.java @@ -14,7 +14,9 @@ public class DatasetRelPublication { - + /** + * The "text" is the citation of the related publication. + */ private String text; private String idType; private String idNumber; diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java index 371d36c6ce4..60072815dd7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java @@ -2,20 +2,25 @@ import edu.harvard.iq.dataverse.util.MarkupChecker; import edu.harvard.iq.dataverse.DatasetFieldType.FieldType; +import edu.harvard.iq.dataverse.branding.BrandingUtil; +import edu.harvard.iq.dataverse.util.FileUtil; import edu.harvard.iq.dataverse.util.StringUtil; import edu.harvard.iq.dataverse.util.SystemConfig; +import edu.harvard.iq.dataverse.util.json.NullSafeJsonBuilder; import edu.harvard.iq.dataverse.workflows.WorkflowComment; import java.io.Serializable; -import java.math.BigDecimal; +import java.net.URL; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Date; +import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.ResourceBundle; import java.util.Set; import java.util.logging.Level; @@ -23,6 +28,7 @@ import java.util.stream.Collectors; import javax.json.Json; 
+import javax.json.JsonArray; import javax.json.JsonArrayBuilder; import javax.json.JsonObjectBuilder; import javax.persistence.CascadeType; @@ -628,6 +634,27 @@ public String getDescription() { return ""; } + public List getDescriptions() { + List descriptions = new ArrayList<>(); + for (DatasetField dsf : this.getDatasetFields()) { + if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.description)) { + String descriptionString = ""; + if (dsf.getDatasetFieldCompoundValues() != null && !dsf.getDatasetFieldCompoundValues().isEmpty()) { + for (DatasetFieldCompoundValue descriptionValue : dsf.getDatasetFieldCompoundValues()) { + for (DatasetField subField : descriptionValue.getChildDatasetFields()) { + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.descriptionText) && !subField.isEmptyForDisplay()) { + descriptionString = subField.getValue(); + } + } + logger.log(Level.FINE, "pristine description: {0}", descriptionString); + descriptions.add(descriptionString); + } + } + } + } + return descriptions; + } + /** * @return Strip out all A string with the description of the dataset that * has been passed through the stripAllTags method to remove all HTML tags. 
@@ -636,6 +663,14 @@ public String getDescriptionPlainText() { return MarkupChecker.stripAllTags(getDescription()); } + public List getDescriptionsPlainText() { + List plainTextDescriptions = new ArrayList<>(); + for (String htmlDescription : getDescriptions()) { + plainTextDescriptions.add(MarkupChecker.stripAllTags(htmlDescription)); + } + return plainTextDescriptions; + } + /** * @return A string with the description of the dataset that has been passed * through the escapeHtml method to change the "less than" sign to "<" @@ -738,7 +773,50 @@ public List getDatasetAuthors() { } return retList; } - + + public List getFunders() { + List retList = new ArrayList<>(); + for (DatasetField dsf : this.getDatasetFields()) { + if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.contributor)) { + boolean addFunder = false; + for (DatasetFieldCompoundValue contributorValue : dsf.getDatasetFieldCompoundValues()) { + String contributorName = null; + String contributorType = null; + for (DatasetField subField : contributorValue.getChildDatasetFields()) { + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.contributorName)) { + contributorName = subField.getDisplayValue(); + } + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.contributorType)) { + contributorType = subField.getDisplayValue(); + // TODO: Consider how this will work in French, Chinese, etc. + String funderString = "Funder"; + if (funderString.equals(contributorType)) { + addFunder = true; + } + } + } + if (addFunder) { + retList.add(contributorName); + } + } + } + if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.grantNumber)) { + for (DatasetFieldCompoundValue grantObject : dsf.getDatasetFieldCompoundValues()) { + for (DatasetField subField : grantObject.getChildDatasetFields()) { + // It would be nice to do something with grantNumberValue (the actual number) but schema.org doesn't support it. 
+ if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.grantNumberAgency)) { + String grantAgency = subField.getDisplayValue(); + if (grantAgency != null && !grantAgency.isEmpty()) { + retList.add(grantAgency); + } + } + } + } + } + } + return retList; + } + public List getTimePeriodsCovered() { List retList = new ArrayList<>(); for (DatasetField dsf : this.getDatasetFields()) { @@ -872,7 +950,8 @@ public List getLanguages() { } return languages; } - + + // TODO: consider calling the newer getSpatialCoverages method below with the commaSeparated boolean set to true. public List getSpatialCoverages() { List retList = new ArrayList<>(); for (DatasetField dsf : this.getDatasetFields()) { @@ -914,19 +993,100 @@ public List getSpatialCoverages() { } return retList; } - + + public List getSpatialCoverages(boolean commaSeparated) { + List retList = new ArrayList<>(); + for (DatasetField dsf : this.getDatasetFields()) { + if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.geographicCoverage)) { + for (DatasetFieldCompoundValue geoValue : dsf.getDatasetFieldCompoundValues()) { + Map coverageHash = new HashMap<>(); + for (DatasetField subField : geoValue.getChildDatasetFields()) { + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.country)) { + if (!subField.isEmptyForDisplay()) { + coverageHash.put(DatasetFieldConstant.country, subField.getValue()); + } + } + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.state)) { + if (!subField.isEmptyForDisplay()) { + coverageHash.put(DatasetFieldConstant.state, subField.getValue()); + } + } + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.city)) { + if (!subField.isEmptyForDisplay()) { + coverageHash.put(DatasetFieldConstant.city, subField.getValue()); + } + } + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.otherGeographicCoverage)) { + if (!subField.isEmptyForDisplay()) { + 
coverageHash.put(DatasetFieldConstant.otherGeographicCoverage, subField.getValue()); + } + } + } + if (!coverageHash.isEmpty()) { + List coverageSorted = sortSpatialCoverage(coverageHash); + if (commaSeparated) { + retList.add(String.join(", ", coverageSorted)); + } else { + retList.addAll(coverageSorted); + } + } + } + } + } + return retList; + } + + private List sortSpatialCoverage(Map hash) { + List sorted = new ArrayList<>(); + String city = hash.get(DatasetFieldConstant.city); + if (city != null) { + sorted.add(city); + } + String state = hash.get(DatasetFieldConstant.state); + if (state != null) { + sorted.add(state); + } + String country = hash.get(DatasetFieldConstant.country); + if (country != null) { + sorted.add(country); + } + String otherGeographicCoverage = hash.get(DatasetFieldConstant.otherGeographicCoverage); + if (otherGeographicCoverage != null) { + sorted.add(otherGeographicCoverage); + } + return sorted; + } + /** * @return List of Strings containing the version's Keywords */ public List getKeywords() { return getCompoundChildFieldValues(DatasetFieldConstant.keyword, DatasetFieldConstant.keywordValue); } - - /** - * @return List of Strings containing the version's PublicationCitations - */ - public List getPublicationCitationValues() { - return getCompoundChildFieldValues(DatasetFieldConstant.publication, DatasetFieldConstant.publicationCitation); + + public List getRelatedPublications() { + List relatedPublications = new ArrayList<>(); + for (DatasetField dsf : this.getDatasetFields()) { + if (dsf.getDatasetFieldType().getName().equals(DatasetFieldConstant.publication)) { + for (DatasetFieldCompoundValue publication : dsf.getDatasetFieldCompoundValues()) { + DatasetRelPublication relatedPublication = new DatasetRelPublication(); + for (DatasetField subField : publication.getChildDatasetFields()) { + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.publicationCitation)) { + String citation = subField.getDisplayValue(); + 
relatedPublication.setText(citation); + } + if (subField.getDatasetFieldType().getName().equals(DatasetFieldConstant.publicationURL)) { + // Prevent href and target=_blank from getting into Schema.org JSON-LD output. + subField.getDatasetFieldType().setDisplayFormat("#VALUE"); + String url = subField.getDisplayValue(); + relatedPublication.setUrl(url); + } + } + relatedPublications.add(relatedPublication); + } + } + } + return relatedPublications; } /** @@ -1059,6 +1219,7 @@ public String getDistributorName() { return null; } + // TODO: Consider renaming this method since it's also used for getting the "provider" for Schema.org JSON-LD. public String getRootDataverseNameforCitation(){ //Get root dataverse name for Citation Dataverse root = this.getDataset().getOwner(); @@ -1369,7 +1530,10 @@ public String getPublicationDateAsString() { // released (published) version. This JSON fragment is generated for a // specific released version - and we can have multiple released versions. // So something will need to be modified to accommodate this. -- L.A. - + /** + * We call the export format "Schema.org JSON-LD" and extensive Javadoc can + * be found in {@link SchemaDotOrgExporter}. + */ public String getJsonLd() { // We show published datasets only for "datePublished" field below. if (!this.isPublished()) { @@ -1382,6 +1546,8 @@ public String getJsonLd() { JsonObjectBuilder job = Json.createObjectBuilder(); job.add("@context", "http://schema.org"); job.add("@type", "Dataset"); + // Note that whenever you use "@id" you should also use "identifier" and vice versa. 
+ job.add("@id", this.getDataset().getPersistentURL()); job.add("identifier", this.getDataset().getPersistentURL()); job.add("name", this.getTitle()); JsonArrayBuilder authors = Json.createArrayBuilder(); @@ -1403,13 +1569,34 @@ public String getJsonLd() { if (!StringUtil.isEmpty(affiliation)) { author.add("affiliation", affiliation); } + String identifierAsUrl = datasetAuthor.getIdentifierAsUrl(); + if (identifierAsUrl != null) { + // It would be valid to provide an array of identifiers for authors but we have decided to only provide one. + author.add("@id", identifierAsUrl); + author.add("identifier", identifierAsUrl); + } authors.add(author); } - job.add("author", authors); + JsonArray authorsArray = authors.build(); + /** + * "creator" is being added alongside "author" (below) as an + * experiment. We think Google Dataset Search might like "creator" + * better." + */ + job.add("creator", authorsArray); + /** + * "author" is still here for backward compatibility. Depending on how + * the "creator" experiment above goes, we may deprecate it in the + * future. + */ + job.add("author", authorsArray); /** * We are aware that there is a "datePublished" field but it means "Date * of first broadcast/publication." This only makes sense for a 1.0 * version. + * + * TODO: Should we remove the comment above about a 1.0 version? We + * included this "datePublished" field in Dataverse 4.8.4. 
*/ String datePublished = this.getDataset().getPublicationDateFormattedYYYYMMDD(); if (datePublished != null) { @@ -1423,7 +1610,18 @@ public String getJsonLd() { */ job.add("dateModified", this.getPublicationDateAsString()); job.add("version", this.getVersionNumber().toString()); - job.add("description", this.getDescriptionPlainText()); + + JsonArrayBuilder descriptionsArray = Json.createArrayBuilder(); + List descriptions = this.getDescriptionsPlainText(); + for (String description : descriptions) { + descriptionsArray.add(description); + } + /** + * In Dataverse 4.8.4 "description" was a single string but now it's an + * array. + */ + job.add("description", descriptionsArray); + /** * "keywords" - contains subject(s), datasetkeyword(s) and topicclassification(s) * metadata fields for the version. -- L.A. @@ -1444,23 +1642,43 @@ public String getJsonLd() { } job.add("keywords", keywords); - + /** - * citation: - * (multiple) publicationCitation values, if present: + * citation: (multiple) related publication citation and URLs, if + * present. + * + * In Dataverse 4.8.4 "citation" was an array of strings but now it's an + * array of objects. 
*/ - - List publicationCitations = getPublicationCitationValues(); - if (publicationCitations.size() > 0) { - JsonArrayBuilder citation = Json.createArrayBuilder(); - for (String pubCitation : publicationCitations) { - //citationEntry.add("@type", "Dataset"); - //citationEntry.add("text", pubCitation); - citation.add(pubCitation); + List relatedPublications = getRelatedPublications(); + if (!relatedPublications.isEmpty()) { + JsonArrayBuilder jsonArrayBuilder = Json.createArrayBuilder(); + for (DatasetRelPublication relatedPub : relatedPublications) { + boolean addToArray = false; + String pubCitation = relatedPub.getText(); + String pubUrl = relatedPub.getUrl(); + if (pubCitation != null || pubUrl != null) { + addToArray = true; + } + JsonObjectBuilder citationEntry = Json.createObjectBuilder(); + citationEntry.add("@type", "CreativeWork"); + if (pubCitation != null) { + citationEntry.add("text", pubCitation); + } + if (pubUrl != null) { + citationEntry.add("@id", pubUrl); + citationEntry.add("identifier", pubUrl); + } + if (addToArray) { + jsonArrayBuilder.add(citationEntry); + } + } + JsonArray jsonArray = jsonArrayBuilder.build(); + if (!jsonArray.isEmpty()) { + job.add("citation", jsonArray); } - job.add("citation", citation); } - + /** * temporalCoverage: * (if available) @@ -1474,22 +1692,18 @@ public String getJsonLd() { } job.add("temporalCoverage", temporalCoverage); } - - /** - * spatialCoverage (if available) - * TODO - * (punted, for now - see #2243) - * - */ - + /** - * funder (if available) - * TODO - * (punted, for now - see #2243) + * https://schema.org/version/3.4/ says, "Note that schema.org release + * numbers are not generally included when you use schema.org. In + * contexts (e.g. related standards work) when a particular release + * needs to be cited, this document provides the appropriate URL." + * + * For the reason above we decided to take out schemaVersion but we're + * leaving this Javadoc in here to remind us that we made this decision. 
+ * We used to include "https://schema.org/version/3.3" in the output for + * "schemaVersion". */ - - job.add("schemaVersion", "https://schema.org/version/3.3"); - TermsOfUseAndAccess terms = this.getTermsOfUseAndAccess(); if (terms != null) { JsonObjectBuilder license = Json.createObjectBuilder().add("@type", "Dataset"); @@ -1513,10 +1727,73 @@ public String getJsonLd() { .add("url", SystemConfig.getDataverseSiteUrlStatic()) ); + String installationBrandName = BrandingUtil.getInstallationBrandName(getRootDataverseNameforCitation()); + /** + * Both "publisher" and "provider" are included but they have the same + * values. Some services seem to prefer one over the other. + */ + job.add("publisher", Json.createObjectBuilder() + .add("@type", "Organization") + .add("name", installationBrandName) + ); job.add("provider", Json.createObjectBuilder() .add("@type", "Organization") - .add("name", "Dataverse") + .add("name", installationBrandName) ); + + List funderNames = getFunders(); + if (!funderNames.isEmpty()) { + JsonArrayBuilder funderArray = Json.createArrayBuilder(); + for (String funderName : funderNames) { + JsonObjectBuilder funder = Json.createObjectBuilder(); + funder.add("@type", "Organization"); + funder.add("name", funderName); + funderArray.add(funder); + } + job.add("funder", funderArray); + } + + boolean commaSeparated = true; + List spatialCoverages = getSpatialCoverages(commaSeparated); + if (!spatialCoverages.isEmpty()) { + JsonArrayBuilder spatialArray = Json.createArrayBuilder(); + for (String spatialCoverage : spatialCoverages) { + spatialArray.add(spatialCoverage); + } + job.add("spatialCoverage", spatialArray); + } + + List fileMetadatasSorted = getFileMetadatasSorted(); + if (fileMetadatasSorted != null && !fileMetadatasSorted.isEmpty()) { + JsonArrayBuilder fileArray = Json.createArrayBuilder(); + String dataverseSiteUrl = SystemConfig.getDataverseSiteUrlStatic(); + for (FileMetadata fileMetadata : fileMetadatasSorted) { + JsonObjectBuilder 
fileObject = NullSafeJsonBuilder.jsonObjectBuilder(); + String filePidUrlAsString = null; + URL filePidUrl = fileMetadata.getDataFile().getGlobalId().toURL(); + if (filePidUrl != null) { + filePidUrlAsString = filePidUrl.toString(); + } + fileObject.add("@type", "DataDownload"); + fileObject.add("name", fileMetadata.getLabel()); + fileObject.add("fileFormat", fileMetadata.getDataFile().getContentType()); + fileObject.add("contentSize", fileMetadata.getDataFile().getFilesize()); + fileObject.add("description", fileMetadata.getDescription()); + fileObject.add("@id", filePidUrlAsString); + fileObject.add("identifier", filePidUrlAsString); + String hideFilesBoolean = System.getProperty(SystemConfig.FILES_HIDE_SCHEMA_DOT_ORG_DOWNLOAD_URLS); + if (hideFilesBoolean != null && hideFilesBoolean.equals("true")) { + // no-op + } else { + if (FileUtil.isPubliclyDownloadable(fileMetadata)) { + String nullDownloadType = null; + fileObject.add("contentUrl", dataverseSiteUrl + FileUtil.getFileDownloadUrlPath(nullDownloadType, fileMetadata.getDataFile().getId(), false)); + } + } + fileArray.add(fileObject); + } + job.add("distribution", fileArray); + } jsonLd = job.build().toString(); return jsonLd; } diff --git a/src/main/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporter.java b/src/main/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporter.java index bbdaec7954a..971f0e5afa5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporter.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporter.java @@ -13,6 +13,60 @@ import javax.json.JsonReader; import javax.ws.rs.core.MediaType; +/** + * Schema.org JSON-LD is used by Google Dataset Search and other services to + * make datasets more discoverable. It is embedded in the HTML of dataset pages + * and available as an export format. + *

+ * Do not make any backward incompatible changes unless it's absolutely + * necessary and list them in the API Guide. The existing list is in the + * "Native API" section. + *

+ * {@link SchemaDotOrgExporterTest} has most of the tests but + * {@link DatasetVersionTest} has some as well. See + * https://schema.org/docs/gs.html#schemaorg_expected for some discussion on + * what a flexible format Schema.org JSON-LD is. Use of tools such as + * https://search.google.com/structured-data/testing-tool and + * https://webmaster.yandex.com/tools/microtest/ and + * http://linter.structured-data.org to make sure Dataverse continues to emit + * valid output is encouraged but you will find that these tools (and the + * underlying spec) can be extremely accommodating to fairly radical + * restructuring of the JSON output. Strings can become objects or arrays, for + * example, and Honey Badger don't care. Because we expect API users will make + * use of the JSON output, you should not change it or you will break their + * code. + *

+ * Copying and pasting output into + * https://search.google.com/structured-data/testing-tool to make sure it's + * still valid can get tedious but we are not aware of a better way. We looked + * at https://github.com/jessedc/ajv-cli (doesn't support JSON-LD, always + * reports "valid"), https://github.com/jsonld-java/jsonld-java and + * https://github.com/jsonld-java/jsonld-java-tools (unclear if they support + * validation), https://github.com/structured-data/linter (couldn't get it + * installed), https://github.com/json-ld/json-ld.org (couldn't get the test + * suite to detect changes) , https://tech.yandex.com/validator/ (requires API + * key), + * https://packagist.org/packages/padosoft/laravel-google-structured-data-testing-tool + * (may be promising). We use https://github.com/everit-org/json-schema in our + * app already to validate JSON Schema but JSON-LD is a different animal. + * https://schema.org/Dataset.jsonld appears to be the way to download just the + * "Dataset" definition ( https://schema.org/Dataset ) from schema.org but the + * official way to download these definitions is from + * https://schema.org/docs/developers.html#defs . Despite all this + * experimentation (some of these tools were found at + * https://medium.com/@vilcins/structured-data-markup-validation-and-testing-tools-1968bd5dea37 + * ), the accepted answer at + * https://webmasters.stackexchange.com/questions/56577/any-way-to-validate-schema-org-json-ld-before-publishing + * is to just copy and paste your output into one of the online tools so for + * now, just do that. + *

+ * Google provides a Schema.org JSON-LD example at + * https://developers.google.com/search/docs/data-types/dataset but we've also + * looked at examples from + * https://zenodo.org/record/1419226/export/schemaorg_jsonld#.W9NJjicpDUI , + * https://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/23980/export , and + * https://doi.pangaea.de/10.1594/PANGAEA.884619 + */ @AutoService(Exporter.class) public class SchemaDotOrgExporter implements Exporter { diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index 8328854f481..ccf5bbef19e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -68,6 +68,12 @@ public class SystemConfig { */ public static final String FILES_DIRECTORY = "dataverse.files.directory"; + /** + * Some installations may not want download URLs to their files to be + * available in Schema.org JSON-LD output. + */ + public static final String FILES_HIDE_SCHEMA_DOT_ORG_DOWNLOAD_URLS = "dataverse.files.hide-schema-dot-org-download-urls"; + /** * A JVM option to override the number of minutes for which a password reset * token is valid ({@link #minutesUntilPasswordResetTokenExpires}). 
diff --git a/src/test/java/edu/harvard/iq/dataverse/DatasetAuthorTest.java b/src/test/java/edu/harvard/iq/dataverse/DatasetAuthorTest.java new file mode 100644 index 00000000000..a9e41659140 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/DatasetAuthorTest.java @@ -0,0 +1,60 @@ +package edu.harvard.iq.dataverse; + +import static org.junit.Assert.assertEquals; +import org.junit.Test; + +public class DatasetAuthorTest { + + @Test + public void testGetIdentifierAsUrlOrcid() { + DatasetAuthor datasetAuthor = new DatasetAuthor(); + datasetAuthor.setIdType("ORCID"); + datasetAuthor.setIdValue("0000-0002-1825-0097"); + String result = datasetAuthor.getIdentifierAsUrl(); + assertEquals("https://orcid.org/0000-0002-1825-0097", result); + } + + @Test + public void testGetIdentifierAsUrlIsni() { + DatasetAuthor datasetAuthor = new DatasetAuthor(); + datasetAuthor.setIdType("ISNI"); + datasetAuthor.setIdValue("0000000121032683"); + String result = datasetAuthor.getIdentifierAsUrl(); + assertEquals("http://www.isni.org/isni/0000000121032683", result); + } + + @Test + public void testGetIdentifierAsUrlLcna() { + DatasetAuthor datasetAuthor = new DatasetAuthor(); + datasetAuthor.setIdType("LCNA"); + datasetAuthor.setIdValue("n82058243"); + String result = datasetAuthor.getIdentifierAsUrl(); + assertEquals("http://id.loc.gov/authorities/names/n82058243", result); + } + + @Test + public void testGetIdentifierAsUrlViaf() { + DatasetAuthor datasetAuthor = new DatasetAuthor(); + datasetAuthor.setIdType("VIAF"); + datasetAuthor.setIdValue("172389567"); + String result = datasetAuthor.getIdentifierAsUrl(); + assertEquals("https://viaf.org/viaf/172389567", result); + } + + @Test + public void testGetIdentifierAsUrlGnd() { + DatasetAuthor datasetAuthor = new DatasetAuthor(); + datasetAuthor.setIdType("GND"); + datasetAuthor.setIdValue("4079154-3"); + String result = datasetAuthor.getIdentifierAsUrl(); + assertEquals("https://d-nb.info/gnd/4079154-3", result); + } + + @Test + 
public void testGetIdentifierAsUrlNull() { + DatasetAuthor datasetAuthor = new DatasetAuthor(); + String result = datasetAuthor.getIdentifierAsUrl(); + assertEquals(null, result); + } + +} diff --git a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValueValidatorTest.java b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValueValidatorTest.java index aeceedc07f6..dedafe7722e 100644 --- a/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValueValidatorTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/DatasetFieldValueValidatorTest.java @@ -5,6 +5,7 @@ */ package edu.harvard.iq.dataverse; +import java.util.regex.Pattern; import javax.validation.ConstraintValidatorContext; import org.junit.After; import org.junit.AfterClass; @@ -136,5 +137,51 @@ public void testIsValid() { assertEquals(false, result); } - + + @Test + public void testIsValidAuthorIdentifierOrcid() { + DatasetFieldValueValidator validator = new DatasetFieldValueValidator(); + Pattern pattern = DatasetAuthor.getValidPattern(DatasetAuthor.REGEX_ORCID); + assertTrue(validator.isValidAuthorIdentifier("0000-0002-1825-0097", pattern)); + // An "X" at the end of an ORCID is less common but still valid. 
+ assertTrue(validator.isValidAuthorIdentifier("0000-0002-1694-233X", pattern)); + assertFalse(validator.isValidAuthorIdentifier("0000 0002 1825 0097", pattern)); + assertFalse(validator.isValidAuthorIdentifier(" 0000-0002-1825-0097", pattern)); + assertFalse(validator.isValidAuthorIdentifier("0000-0002-1825-0097 ", pattern)); + assertFalse(validator.isValidAuthorIdentifier("junk", pattern)); + } + + @Test + public void testIsValidAuthorIdentifierIsni() { + DatasetFieldValueValidator validator = new DatasetFieldValueValidator(); + Pattern pattern = DatasetAuthor.getValidPattern(DatasetAuthor.REGEX_ISNI); + assertTrue(validator.isValidAuthorIdentifier("0000000121032683", pattern)); + assertFalse(validator.isValidAuthorIdentifier("junk", pattern)); + } + + @Test + public void testIsValidAuthorIdentifierLcna() { + DatasetFieldValueValidator validator = new DatasetFieldValueValidator(); + Pattern pattern = DatasetAuthor.getValidPattern(DatasetAuthor.REGEX_LCNA); + assertTrue(validator.isValidAuthorIdentifier("n82058243", pattern)); + assertTrue(validator.isValidAuthorIdentifier("foobar123", pattern)); + assertFalse(validator.isValidAuthorIdentifier("junk", pattern)); + } + + @Test + public void testIsValidAuthorIdentifierViaf() { + DatasetFieldValueValidator validator = new DatasetFieldValueValidator(); + Pattern pattern = DatasetAuthor.getValidPattern(DatasetAuthor.REGEX_VIAF); + assertTrue(validator.isValidAuthorIdentifier("172389567", pattern)); + assertFalse(validator.isValidAuthorIdentifier("junk", pattern)); + } + + @Test + public void testIsValidAuthorIdentifierGnd() { + DatasetFieldValueValidator validator = new DatasetFieldValueValidator(); + Pattern pattern = DatasetAuthor.getValidPattern(DatasetAuthor.REGEX_GND); + assertTrue(validator.isValidAuthorIdentifier("4079154-3", pattern)); + assertFalse(validator.isValidAuthorIdentifier("junk", pattern)); + } + } diff --git a/src/test/java/edu/harvard/iq/dataverse/DatasetVersionTest.java 
b/src/test/java/edu/harvard/iq/dataverse/DatasetVersionTest.java index 500b30b2062..3ca69fa71d2 100644 --- a/src/test/java/edu/harvard/iq/dataverse/DatasetVersionTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/DatasetVersionTest.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse; import edu.harvard.iq.dataverse.mocks.MocksFactory; +import edu.harvard.iq.dataverse.util.json.JsonUtil; import java.io.StringReader; import java.sql.Timestamp; import java.text.ParseException; @@ -94,6 +95,9 @@ public void testIsInReview() { assertFalse(nonDraft.isInReview()); } + /** + * See also SchemaDotOrgExporterTest.java for more extensive tests. + */ @Test public void testGetJsonLd() throws ParseException { Dataset dataset = new Dataset(); @@ -115,14 +119,21 @@ public void testGetJsonLd() throws ParseException { Dataverse dataverse = new Dataverse(); dataverse.setName("LibraScholar"); dataset.setOwner(dataverse); + TermsOfUseAndAccess terms = new TermsOfUseAndAccess(); + terms.setLicense(TermsOfUseAndAccess.License.CC0); + datasetVersion.setTermsOfUseAndAccess(terms); String jsonLd = datasetVersion.getJsonLd(); - System.out.println("jsonLd: " + jsonLd); + System.out.println("jsonLd: " + JsonUtil.prettyPrint(jsonLd)); JsonReader jsonReader = Json.createReader(new StringReader(jsonLd)); JsonObject obj = jsonReader.readObject(); assertEquals("http://schema.org", obj.getString("@context")); assertEquals("Dataset", obj.getString("@type")); + assertEquals("https://doi.org/10.5072/FK2/LK0D1H", obj.getString("@id")); assertEquals("https://doi.org/10.5072/FK2/LK0D1H", obj.getString("identifier")); - assertEquals("https://schema.org/version/3.3", obj.getString("schemaVersion")); + assertEquals(null, obj.getString("schemaVersion", null)); + assertEquals("Dataset", obj.getJsonObject("license").getString("@type")); + assertEquals("CC0", obj.getJsonObject("license").getString("text")); + assertEquals("https://creativecommons.org/publicdomain/zero/1.0/", 
obj.getJsonObject("license").getString("url")); assertEquals("1955-11-05", obj.getString("dateModified")); assertEquals("1955-11-05", obj.getString("datePublished")); assertEquals("1", obj.getString("version")); @@ -130,10 +141,70 @@ public void testGetJsonLd() throws ParseException { assertEquals("", obj.getString("name")); // TODO: If it ever becomes easier to mock authors, test them. JsonArray emptyArray = Json.createArrayBuilder().build(); + assertEquals(emptyArray, obj.getJsonArray("creator")); assertEquals(emptyArray, obj.getJsonArray("author")); // TODO: If it ever becomes easier to mock subjects, test them. assertEquals(emptyArray, obj.getJsonArray("keywords")); - assertEquals("Dataverse", obj.getJsonObject("provider").getString("name")); + assertEquals("Organization", obj.getJsonObject("publisher").getString("@type")); + assertEquals("LibraScholar", obj.getJsonObject("publisher").getString("name")); + assertEquals("Organization", obj.getJsonObject("provider").getString("@type")); + assertEquals("LibraScholar", obj.getJsonObject("provider").getString("name")); + assertEquals("LibraScholar", obj.getJsonObject("includedInDataCatalog").getString("name")); + } + + @Test + public void testGetJsonLdNonCC0License() throws ParseException { + Dataset dataset = new Dataset(); + dataset.setProtocol("doi"); + dataset.setAuthority("10.5072/FK2"); + dataset.setIdentifier("LK0D1H"); + DatasetVersion datasetVersion = new DatasetVersion(); + datasetVersion.setDataset(dataset); + datasetVersion.setVersionState(DatasetVersion.VersionState.DRAFT); + assertEquals("", datasetVersion.getPublicationDateAsString()); + // Only published datasets return any JSON. 
+ assertEquals("", datasetVersion.getJsonLd()); + datasetVersion.setVersionState(DatasetVersion.VersionState.RELEASED); + datasetVersion.setVersionNumber(1L); + SimpleDateFormat dateFmt = new SimpleDateFormat("yyyyMMdd"); + Date publicationDate = dateFmt.parse("19551105"); + datasetVersion.setReleaseTime(publicationDate); + dataset.setPublicationDate(new Timestamp(publicationDate.getTime())); + Dataverse dataverse = new Dataverse(); + dataverse.setName("LibraScholar"); + dataset.setOwner(dataverse); + + TermsOfUseAndAccess terms = new TermsOfUseAndAccess(); + terms.setLicense(TermsOfUseAndAccess.License.NONE); + terms.setTermsOfUse("Call me maybe"); + datasetVersion.setTermsOfUseAndAccess(terms); + + String jsonLd = datasetVersion.getJsonLd(); + System.out.println("jsonLd: " + JsonUtil.prettyPrint(jsonLd)); + JsonReader jsonReader = Json.createReader(new StringReader(jsonLd)); + JsonObject obj = jsonReader.readObject(); + assertEquals("http://schema.org", obj.getString("@context")); + assertEquals("Dataset", obj.getString("@type")); + assertEquals("https://doi.org/10.5072/FK2/LK0D1H", obj.getString("@id")); + assertEquals("https://doi.org/10.5072/FK2/LK0D1H", obj.getString("identifier")); + assertEquals(null, obj.getString("schemaVersion", null)); + assertEquals("Dataset", obj.getJsonObject("license").getString("@type")); + assertEquals("Call me maybe", obj.getJsonObject("license").getString("text")); + assertEquals("1955-11-05", obj.getString("dateModified")); + assertEquals("1955-11-05", obj.getString("datePublished")); + assertEquals("1", obj.getString("version")); + // TODO: if it ever becomes easier to mock a dataset title, test it. + assertEquals("", obj.getString("name")); + // TODO: If it ever becomes easier to mock authors, test them. 
+ JsonArray emptyArray = Json.createArrayBuilder().build(); + assertEquals(emptyArray, obj.getJsonArray("creator")); + assertEquals(emptyArray, obj.getJsonArray("author")); + // TODO: If it ever becomes easier to mock subjects, test them. + assertEquals(emptyArray, obj.getJsonArray("keywords")); + assertEquals("Organization", obj.getJsonObject("publisher").getString("@type")); + assertEquals("LibraScholar", obj.getJsonObject("publisher").getString("name")); + assertEquals("Organization", obj.getJsonObject("provider").getString("@type")); + assertEquals("LibraScholar", obj.getJsonObject("provider").getString("name")); assertEquals("LibraScholar", obj.getJsonObject("includedInDataCatalog").getString("name")); } diff --git a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java index a24f99ccf8a..a21ddc0e604 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/SchemaDotOrgExporterTest.java @@ -1,22 +1,30 @@ package edu.harvard.iq.dataverse.export; import edu.harvard.iq.dataverse.ControlledVocabularyValue; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetFieldType; import edu.harvard.iq.dataverse.DatasetVersion; import edu.harvard.iq.dataverse.Dataverse; +import edu.harvard.iq.dataverse.FileMetadata; +import edu.harvard.iq.dataverse.TermsOfUseAndAccess; import static edu.harvard.iq.dataverse.util.SystemConfig.SITE_URL; +import static edu.harvard.iq.dataverse.util.SystemConfig.FILES_HIDE_SCHEMA_DOT_ORG_DOWNLOAD_URLS; import edu.harvard.iq.dataverse.util.json.JsonParser; import edu.harvard.iq.dataverse.util.json.JsonUtil; import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.PrintWriter; import java.io.StringReader; import java.nio.file.Files; import java.nio.file.Paths; +import 
java.sql.Timestamp; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashSet; +import java.util.List; import java.util.Set; import javax.json.Json; import javax.json.JsonObject; @@ -28,6 +36,9 @@ import org.junit.Test; import static org.junit.Assert.*; +/** + * For docs see {@link SchemaDotOrgExporter}. + */ public class SchemaDotOrgExporterTest { private final SchemaDotOrgExporter schemaDotOrgExporter; @@ -88,6 +99,19 @@ public void setUp() { dsDescriptionType.setChildDatasetFieldTypes(dsDescriptionTypes); DatasetFieldType keywordType = datasetFieldTypeSvc.add(new DatasetFieldType("keyword", DatasetFieldType.FieldType.TEXT, true)); + Set keywordChildTypes = new HashSet<>(); + keywordChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("keywordValue", DatasetFieldType.FieldType.TEXT, false))); + keywordChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("keywordVocabulary", DatasetFieldType.FieldType.TEXT, false))); + keywordChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("keywordVocabularyURI", DatasetFieldType.FieldType.TEXT, false))); + keywordType.setChildDatasetFieldTypes(keywordChildTypes); + + DatasetFieldType topicClassificationType = datasetFieldTypeSvc.add(new DatasetFieldType("topicClassification", DatasetFieldType.FieldType.TEXT, true)); + Set topicClassificationTypes = new HashSet<>(); + topicClassificationTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("topicClassValue", DatasetFieldType.FieldType.TEXT, false))); + topicClassificationTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("topicClassVocab", DatasetFieldType.FieldType.TEXT, false))); + topicClassificationTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("topicClassVocabURI", DatasetFieldType.FieldType.TEXT, false))); + topicClassificationType.setChildDatasetFieldTypes(topicClassificationTypes); + DatasetFieldType descriptionType = datasetFieldTypeSvc.add(new 
DatasetFieldType("description", DatasetFieldType.FieldType.TEXTBOX, false)); DatasetFieldType subjectType = datasetFieldTypeSvc.add(new DatasetFieldType("subject", DatasetFieldType.FieldType.TEXT, true)); @@ -115,6 +139,82 @@ public void setUp() { t.setParentDatasetFieldType(compoundSingleType); } compoundSingleType.setChildDatasetFieldTypes(childTypes); + + DatasetFieldType contributorType = datasetFieldTypeSvc.add(new DatasetFieldType("contributor", DatasetFieldType.FieldType.TEXT, true)); + Set contributorChildTypes = new HashSet<>(); + contributorChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("contributorName", DatasetFieldType.FieldType.TEXT, false))); + DatasetFieldType contributorTypes = datasetFieldTypeSvc.add(new DatasetFieldType("contributorType", DatasetFieldType.FieldType.TEXT, false)); + contributorTypes.setAllowControlledVocabulary(true); + contributorTypes.setControlledVocabularyValues(Arrays.asList( + // Why aren't these enforced? + new ControlledVocabularyValue(1l, "Data Collector", contributorTypes), + new ControlledVocabularyValue(2l, "Data Curator", contributorTypes), + new ControlledVocabularyValue(3l, "Data Manager", contributorTypes), + new ControlledVocabularyValue(3l, "Editor", contributorTypes), + new ControlledVocabularyValue(3l, "Funder", contributorTypes), + new ControlledVocabularyValue(3l, "Hosting Institution", contributorTypes) + // Etc. There are more. 
+ )); + contributorChildTypes.add(datasetFieldTypeSvc.add(contributorTypes)); + for (DatasetFieldType t : contributorChildTypes) { + t.setParentDatasetFieldType(contributorType); + } + contributorType.setChildDatasetFieldTypes(contributorChildTypes); + + DatasetFieldType grantNumberType = datasetFieldTypeSvc.add(new DatasetFieldType("grantNumber", DatasetFieldType.FieldType.TEXT, true)); + Set grantNumberChildTypes = new HashSet<>(); + grantNumberChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("grantNumberAgency", DatasetFieldType.FieldType.TEXT, false))); + grantNumberChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("grantNumberValue", DatasetFieldType.FieldType.TEXT, false))); + grantNumberType.setChildDatasetFieldTypes(grantNumberChildTypes); + + DatasetFieldType publicationType = datasetFieldTypeSvc.add(new DatasetFieldType("publication", DatasetFieldType.FieldType.TEXT, true)); + Set publicationChildTypes = new HashSet<>(); + publicationChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("publicationCitation", DatasetFieldType.FieldType.TEXT, false))); + DatasetFieldType publicationIdTypes = datasetFieldTypeSvc.add(new DatasetFieldType("publicationIDType", DatasetFieldType.FieldType.TEXT, false)); + publicationIdTypes.setAllowControlledVocabulary(true); + publicationIdTypes.setControlledVocabularyValues(Arrays.asList( + // Why aren't these enforced? + new ControlledVocabularyValue(1l, "ark", publicationIdTypes), + new ControlledVocabularyValue(2l, "arXiv", publicationIdTypes), + new ControlledVocabularyValue(3l, "bibcode", publicationIdTypes), + new ControlledVocabularyValue(4l, "doi", publicationIdTypes), + new ControlledVocabularyValue(5l, "ean13", publicationIdTypes), + new ControlledVocabularyValue(6l, "handle", publicationIdTypes) + // Etc. There are more. 
+ )); + publicationChildTypes.add(datasetFieldTypeSvc.add(publicationIdTypes)); + publicationChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("publicationIDNumber", DatasetFieldType.FieldType.TEXT, false))); + DatasetFieldType publicationURLType = new DatasetFieldType("publicationURL", DatasetFieldType.FieldType.URL, false); + publicationURLType.setDisplayFormat("#VALUE"); + publicationChildTypes.add(datasetFieldTypeSvc.add(publicationURLType)); + publicationType.setChildDatasetFieldTypes(publicationChildTypes); + + DatasetFieldType timePeriodCoveredType = datasetFieldTypeSvc.add(new DatasetFieldType("timePeriodCovered", DatasetFieldType.FieldType.NONE, true)); + Set timePeriodCoveredChildTypes = new HashSet<>(); + timePeriodCoveredChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("timePeriodCoveredStart", DatasetFieldType.FieldType.DATE, false))); + timePeriodCoveredChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("timePeriodCoveredEnd", DatasetFieldType.FieldType.DATE, false))); + timePeriodCoveredType.setChildDatasetFieldTypes(timePeriodCoveredChildTypes); + + DatasetFieldType geographicCoverageType = datasetFieldTypeSvc.add(new DatasetFieldType("geographicCoverage", DatasetFieldType.FieldType.TEXT, true)); + Set geographicCoverageChildTypes = new HashSet<>(); + DatasetFieldType countries = datasetFieldTypeSvc.add(new DatasetFieldType("country", DatasetFieldType.FieldType.TEXT, false)); + countries.setAllowControlledVocabulary(true); + countries.setControlledVocabularyValues(Arrays.asList( + // Why aren't these enforced? + new ControlledVocabularyValue(1l, "Afghanistan", countries), + new ControlledVocabularyValue(2l, "Albania", countries) + // And many more countries. 
+ )); + geographicCoverageChildTypes.add(datasetFieldTypeSvc.add(countries)); + geographicCoverageChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("state", DatasetFieldType.FieldType.TEXT, false))); + geographicCoverageChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("city", DatasetFieldType.FieldType.TEXT, false))); + geographicCoverageChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("otherGeographicCoverage", DatasetFieldType.FieldType.TEXT, false))); + geographicCoverageChildTypes.add(datasetFieldTypeSvc.add(new DatasetFieldType("geographicUnit", DatasetFieldType.FieldType.TEXT, false))); + for (DatasetFieldType t : geographicCoverageChildTypes) { + t.setParentDatasetFieldType(geographicCoverageType); + } + geographicCoverageType.setChildDatasetFieldTypes(geographicCoverageChildTypes); + } @After @@ -127,7 +227,7 @@ public void tearDown() { @Test public void testExportDataset() throws Exception { System.out.println("exportDataset"); - File datasetVersionJson = new File("src/test/resources/json/dataset-finch1.json"); + File datasetVersionJson = new File("src/test/resources/json/dataset-finch2.json"); String datasetVersionAsJson = new String(Files.readAllBytes(Paths.get(datasetVersionJson.getAbsolutePath()))); JsonReader jsonReader1 = Json.createReader(new StringReader(datasetVersionAsJson)); @@ -139,40 +239,112 @@ public void testExportDataset() throws Exception { Date publicationDate = dateFmt.parse("19551105"); version.setReleaseTime(publicationDate); version.setVersionNumber(1l); - // TODO: It might be nice to test TermsOfUseAndAccess some day - version.setTermsOfUseAndAccess(null); + TermsOfUseAndAccess terms = new TermsOfUseAndAccess(); + terms.setLicense(TermsOfUseAndAccess.License.CC0); + version.setTermsOfUseAndAccess(terms); + Dataset dataset = new Dataset(); dataset.setProtocol("doi"); - dataset.setAuthority("myAuthority"); - dataset.setIdentifier("myIdentifier"); + dataset.setAuthority("10.5072/FK2"); + 
dataset.setIdentifier("IMK5A4"); + dataset.setPublicationDate(new Timestamp(publicationDate.getTime())); version.setDataset(dataset); Dataverse dataverse = new Dataverse(); dataverse.setName("LibraScholar"); dataset.setOwner(dataverse); System.setProperty(SITE_URL, "https://librascholar.org"); + boolean hideFileUrls = false; + if (hideFileUrls) { + System.setProperty(FILES_HIDE_SCHEMA_DOT_ORG_DOWNLOAD_URLS, "true"); + } + + FileMetadata fmd = new FileMetadata(); + DataFile dataFile = new DataFile(); + dataFile.setId(42l); + dataFile.setFilesize(1234); + dataFile.setContentType("text/plain"); + dataFile.setProtocol("doi"); + dataFile.setAuthority("10.5072/FK2"); + dataFile.setIdentifier("7V5MPI"); + fmd.setDatasetVersion(version); + fmd.setDataFile(dataFile); + fmd.setLabel("README.md"); + fmd.setDescription("README file."); + List fileMetadatas = new ArrayList<>(); + fileMetadatas.add(fmd); + dataFile.setFileMetadatas(fileMetadatas);; + dataFile.setOwner(dataset); + version.setFileMetadatas(fileMetadatas); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); schemaDotOrgExporter.exportDataset(version, json1, byteArrayOutputStream); String jsonLd = byteArrayOutputStream.toString(); - System.out.println("schema.org JSON-LD: " + JsonUtil.prettyPrint(jsonLd)); + String prettyJson = JsonUtil.prettyPrint(jsonLd); + System.out.println("schema.org JSON-LD: " + prettyJson); JsonReader jsonReader2 = Json.createReader(new StringReader(jsonLd)); JsonObject json2 = jsonReader2.readObject(); assertEquals("http://schema.org", json2.getString("@context")); assertEquals("Dataset", json2.getString("@type")); - assertEquals("https://doi.org/myAuthority/myIdentifier", json2.getString("identifier")); + assertEquals("https://doi.org/10.5072/FK2/IMK5A4", json2.getString("@id")); + assertEquals("https://doi.org/10.5072/FK2/IMK5A4", json2.getString("identifier")); assertEquals("Darwin's Finches", json2.getString("name")); + assertEquals("Finch, Fiona", 
json2.getJsonArray("creator").getJsonObject(0).getString("name")); + assertEquals("Birds Inc.", json2.getJsonArray("creator").getJsonObject(0).getString("affiliation")); + assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("creator").getJsonObject(0).getString("@id")); + assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("creator").getJsonObject(0).getString("identifier")); assertEquals("Finch, Fiona", json2.getJsonArray("author").getJsonObject(0).getString("name")); assertEquals("Birds Inc.", json2.getJsonArray("author").getJsonObject(0).getString("affiliation")); + assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("author").getJsonObject(0).getString("@id")); + assertEquals("https://orcid.org/0000-0002-1825-0097", json2.getJsonArray("author").getJsonObject(0).getString("identifier")); + assertEquals("1955-11-05", json2.getString("datePublished")); assertEquals("1955-11-05", json2.getString("dateModified")); assertEquals("1", json2.getString("version")); - assertEquals("Darwin's finches (also known as the Galápagos finches) are a group of about fifteen species of passerine birds.", json2.getString("description")); + assertEquals("Darwin's finches (also known as the Galápagos finches) are a group of about fifteen species of passerine birds.", json2.getJsonArray("description").getString(0)); + assertEquals("Bird is the word.", json2.getJsonArray("description").getString(1)); + assertEquals(2, json2.getJsonArray("description").size()); assertEquals("Medicine, Health and Life Sciences", json2.getJsonArray("keywords").getString(0)); - assertEquals("https://schema.org/version/3.3", json2.getString("schemaVersion")); + assertEquals("tcTerm1", json2.getJsonArray("keywords").getString(1)); + assertEquals("KeywordTerm1", json2.getJsonArray("keywords").getString(2)); + assertEquals("KeywordTerm2", json2.getJsonArray("keywords").getString(3)); + // This dataset, for example, has multiple keywords separated by 
commas: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/24034&version=2.0 + assertEquals("keywords, with, commas", json2.getJsonArray("keywords").getString(4)); + assertEquals("CreativeWork", json2.getJsonArray("citation").getJsonObject(0).getString("@type")); + assertEquals("Finch, Fiona 2018. \"The Finches.\" American Ornithological Journal 60 (4): 990-1005.", json2.getJsonArray("citation").getJsonObject(0).getString("text")); + assertEquals("https://doi.org/10.5072/FK2/RV16HK", json2.getJsonArray("citation").getJsonObject(0).getString("@id")); + assertEquals("https://doi.org/10.5072/FK2/RV16HK", json2.getJsonArray("citation").getJsonObject(0).getString("identifier")); + assertEquals("2002/2005", json2.getJsonArray("temporalCoverage").getString(0)); + assertEquals("2001-10-01/2015-11-15", json2.getJsonArray("temporalCoverage").getString(1)); + assertEquals(null, json2.getString("schemaVersion", null)); + assertEquals("Dataset", json2.getJsonObject("license").getString("@type")); + assertEquals("CC0", json2.getJsonObject("license").getString("text")); + assertEquals("https://creativecommons.org/publicdomain/zero/1.0/", json2.getJsonObject("license").getString("url")); assertEquals("DataCatalog", json2.getJsonObject("includedInDataCatalog").getString("@type")); assertEquals("LibraScholar", json2.getJsonObject("includedInDataCatalog").getString("name")); assertEquals("https://librascholar.org", json2.getJsonObject("includedInDataCatalog").getString("url")); + assertEquals("Organization", json2.getJsonObject("publisher").getString("@type")); + assertEquals("LibraScholar", json2.getJsonObject("provider").getString("name")); assertEquals("Organization", json2.getJsonObject("provider").getString("@type")); - assertEquals("Dataverse", json2.getJsonObject("provider").getString("name")); + assertEquals("LibraScholar", json2.getJsonObject("provider").getString("name")); + assertEquals("Organization", 
json2.getJsonArray("funder").getJsonObject(0).getString("@type")); + assertEquals("National Science Foundation", json2.getJsonArray("funder").getJsonObject(0).getString("name")); + // The NIH grant number is not shown because don't have anywhere in schema.org to put it. :( + assertEquals("National Institutes of Health", json2.getJsonArray("funder").getJsonObject(1).getString("name")); + assertEquals(2, json2.getJsonArray("funder").size()); + assertEquals("Columbus, Ohio, United States, North America", json2.getJsonArray("spatialCoverage").getString(0)); + assertEquals("Wisconsin, United States", json2.getJsonArray("spatialCoverage").getString(1)); + assertEquals(2, json2.getJsonArray("spatialCoverage").size()); + assertEquals("DataDownload", json2.getJsonArray("distribution").getJsonObject(0).getString("@type")); + assertEquals("README.md", json2.getJsonArray("distribution").getJsonObject(0).getString("name")); + assertEquals("text/plain", json2.getJsonArray("distribution").getJsonObject(0).getString("fileFormat")); + assertEquals(1234, json2.getJsonArray("distribution").getJsonObject(0).getInt("contentSize")); + assertEquals("README file.", json2.getJsonArray("distribution").getJsonObject(0).getString("description")); + assertEquals("https://doi.org/10.5072/FK2/7V5MPI", json2.getJsonArray("distribution").getJsonObject(0).getString("@id")); + assertEquals("https://doi.org/10.5072/FK2/7V5MPI", json2.getJsonArray("distribution").getJsonObject(0).getString("identifier")); + assertEquals("https://librascholar.org/api/access/datafile/42", json2.getJsonArray("distribution").getJsonObject(0).getString("contentUrl")); + assertEquals(1, json2.getJsonArray("distribution").size()); + try (PrintWriter printWriter = new PrintWriter("/tmp/dvjsonld.json")) { + printWriter.println(prettyJson); + } } /** diff --git a/src/test/resources/json/dataset-finch2.json b/src/test/resources/json/dataset-finch2.json new file mode 100644 index 00000000000..b3c01eb3d82 --- /dev/null +++ 
b/src/test/resources/json/dataset-finch2.json @@ -0,0 +1,354 @@ +{ + "datasetVersion": { + "metadataBlocks": { + "citation": { + "fields": [ + { + "value": "Darwin's Finches", + "typeClass": "primitive", + "multiple": false, + "typeName": "title" + }, + { + "value": [ + { + "authorName": { + "value": "Finch, Fiona", + "typeClass": "primitive", + "multiple": false, + "typeName": "authorName" + }, + "authorIdentifierScheme": { + "typeName": "authorIdentifierScheme", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "ORCID" + }, + "authorIdentifier": { + "typeName": "authorIdentifier", + "multiple": false, + "typeClass": "primitive", + "value": "0000-0002-1825-0097" + }, + "authorAffiliation": { + "value": "Birds Inc.", + "typeClass": "primitive", + "multiple": false, + "typeName": "authorAffiliation" + } + } + ], + "typeClass": "compound", + "multiple": true, + "typeName": "author" + }, + { + "value": [ + { + "datasetContactEmail": { + "typeClass": "primitive", + "multiple": false, + "typeName": "datasetContactEmail", + "value": "finch@mailinator.com" + } + } + ], + "typeClass": "compound", + "multiple": true, + "typeName": "datasetContact" + }, + { + "value": [ + { + "dsDescriptionValue": { + "value": "Darwin's finches (also known as the Galápagos finches) are a group of about fifteen species of passerine birds.", + "multiple": false, + "typeClass": "primitive", + "typeName": "dsDescriptionValue" + } + }, + { + "dsDescriptionValue": { + "value": "Bird is the word.", + "multiple": false, + "typeClass": "primitive", + "typeName": "dsDescriptionValue" + } + } + ], + "typeClass": "compound", + "multiple": true, + "typeName": "dsDescription" + }, + { + "value": [ + "Medicine, Health and Life Sciences" + ], + "typeClass": "controlledVocabulary", + "multiple": true, + "typeName": "subject" + }, + { + "typeName": "keyword", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": 
false, + "typeClass": "primitive", + "value": "KeywordTerm1" + }, + "keywordVocabulary": { + "typeName": "keywordVocabulary", + "multiple": false, + "typeClass": "primitive", + "value": "KeywordVocabulary1" + }, + "keywordVocabularyURI": { + "typeName": "keywordVocabularyURI", + "multiple": false, + "typeClass": "primitive", + "value": "http://KeywordVocabularyURL1.org" + } + }, + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": false, + "typeClass": "primitive", + "value": "KeywordTerm2" + }, + "keywordVocabulary": { + "typeName": "keywordVocabulary", + "multiple": false, + "typeClass": "primitive", + "value": "KeywordVocabulary2" + }, + "keywordVocabularyURI": { + "typeName": "keywordVocabularyURI", + "multiple": false, + "typeClass": "primitive", + "value": "http://KeywordVocabularyURL2.org" + } + }, + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": false, + "typeClass": "primitive", + "value": "keywords, with, commas" + } + } + ] + }, + { + "typeName": "topicClassification", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "topicClassValue": { + "typeName": "topicClassValue", + "multiple": false, + "typeClass": "primitive", + "value": "tcTerm1" + }, + "topicClassVocab": { + "typeName": "topicClassVocab", + "multiple": false, + "typeClass": "primitive", + "value": "tcVocab1" + }, + "topicClassVocabURI": { + "typeName": "topicClassVocabURI", + "multiple": false, + "typeClass": "primitive", + "value": "http://example.com/tcTerm1" + } + } + ] + }, + { + "typeName": "contributor", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "contributorType": { + "typeName": "contributorType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "Data Collector" + }, + "contributorName": { + "typeName": "contributorName", + "multiple": false, + "typeClass": "primitive", + "value": "Holmes, Sherlock" + } + }, + { + "contributorType": { + "typeName": "contributorType", + "multiple": false, + 
"typeClass": "controlledVocabulary", + "value": "Funder" + }, + "contributorName": { + "typeName": "contributorName", + "multiple": false, + "typeClass": "primitive", + "value": "National Science Foundation" + } + } + ] + }, + { + "typeName": "grantNumber", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "grantNumberAgency": { + "typeName": "grantNumberAgency", + "multiple": false, + "typeClass": "primitive", + "value": "National Institutes of Health" + }, + "grantNumberValue": { + "typeName": "grantNumberValue", + "multiple": false, + "typeClass": "primitive", + "value": "1245" + } + } + ] + }, + { + "typeName": "publication", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "publicationCitation": { + "typeName": "publicationCitation", + "multiple": false, + "typeClass": "primitive", + "value": "Finch, Fiona 2018. \"The Finches.\" American Ornithological Journal 60 (4): 990-1005." + }, + "publicationIDType": { + "typeName": "publicationIDType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "doi" + }, + "publicationIDNumber": { + "typeName": "publicationIDNumber", + "multiple": false, + "typeClass": "primitive", + "value": "10.5072/FK2/RV16HK" + }, + "publicationURL": { + "typeName": "publicationURL", + "multiple": false, + "typeClass": "primitive", + "value": "https://doi.org/10.5072/FK2/RV16HK" + } + } + ] + }, + { + "typeName": "timePeriodCovered", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "timePeriodCoveredStart": { + "typeName": "timePeriodCoveredStart", + "multiple": false, + "typeClass": "primitive", + "value": "2002" + }, + "timePeriodCoveredEnd": { + "typeName": "timePeriodCoveredEnd", + "multiple": false, + "typeClass": "primitive", + "value": "2005" + } + }, + { + "timePeriodCoveredStart": { + "typeName": "timePeriodCoveredStart", + "multiple": false, + "typeClass": "primitive", + "value": "2001-10-01" + }, + "timePeriodCoveredEnd": { + "typeName": "timePeriodCoveredEnd", 
+ "multiple": false, + "typeClass": "primitive", + "value": "2015-11-15" + } + } + ] + } + ], + "displayName": "Citation Metadata" + }, + "geospatial": { + "displayName": "Geospatial Metadata", + "fields": [ + { + "typeName": "geographicCoverage", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "city": { + "typeName": "city", + "multiple": false, + "typeClass": "primitive", + "value": "Columbus" + }, + "state": { + "typeName": "state", + "multiple": false, + "typeClass": "primitive", + "value": "Ohio" + }, + "country": { + "typeName": "country", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "United States" + }, + "otherGeographicCoverage": { + "typeName": "otherGeographicCoverage", + "multiple": false, + "typeClass": "primitive", + "value": "North America" + } + }, + { + "country": { + "typeName": "country", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "United States" + }, + "state": { + "typeName": "state", + "multiple": false, + "typeClass": "primitive", + "value": "Wisconsin" + } + } + ] + } + ] + } + } + } +}