Commit 7df17dc
Merge pull request #9175 from GlobalDataverseCommunityConsortium/DANS-external_exporters

DANS - Exporters in external jars

kcondon authored May 30, 2023
2 parents e2eb6d9 + 4322d50 commit 7df17dc
Showing 53 changed files with 1,810 additions and 866 deletions.
12 changes: 11 additions & 1 deletion .github/workflows/container_app_pr.yml
@@ -57,8 +57,18 @@ jobs:
run: |
echo "IMAGE_TAG=$(echo "${{ github.event.client_payload.pull_request.head.ref }}" | tr '\\/_:&+,;#*' '-')" >> $GITHUB_ENV
# Necessary to split as otherwise the submodules are not available (deploy skips install)
- name: Build app container image with local architecture and submodules (profile will skip tests)
run: >
mvn -B -f modules/dataverse-parent
-P ct -pl edu.harvard.iq:dataverse -am
install
- name: Deploy multi-arch application container image
run: mvn -Pct deploy -Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }} -Ddocker.registry=ghcr.io -Ddocker.platforms=${{ env.PLATFORMS }}
run: >
mvn
-Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }}
${{ env.REGISTRY }} -Ddocker.platforms=${{ env.PLATFORMS }}
-P ct deploy
- uses: marocchino/sticky-pull-request-comment@v2
with:
37 changes: 20 additions & 17 deletions .github/workflows/container_app_push.yml
@@ -39,18 +39,16 @@ jobs:
uses: actions/setup-java@v3
with:
java-version: "11"
distribution: 'adopt'
- name: Cache Maven packages
uses: actions/cache@v3
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
distribution: temurin
cache: maven

- name: Build app container image with local architecture
run: mvn -Pct package
- name: Build app container image with local architecture and submodules (profile will skip tests)
run: >
mvn -B -f modules/dataverse-parent
-P ct -pl edu.harvard.iq:dataverse -am
install
# TODO: add smoke / integration testing here
# TODO: add smoke / integration testing here (add "-Pct -DskipIntegrationTests=false")

hub-description:
needs: build
@@ -100,12 +98,7 @@ jobs:
- uses: actions/setup-java@v3
with:
java-version: "11"
distribution: 'adopt'
- uses: actions/cache@v3
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
distribution: temurin

# Depending on context, we push to different targets. Login accordingly.
- if: ${{ github.event_name != 'pull_request' }}
@@ -136,8 +129,18 @@
echo "IMAGE_TAG=$(echo "$GITHUB_HEAD_REF" | tr '\\/_:&+,;#*' '-')" >> $GITHUB_ENV
echo "REGISTRY='-Ddocker.registry=ghcr.io'" >> $GITHUB_ENV
# Necessary to split as otherwise the submodules are not available (deploy skips install)
- name: Build app container image with local architecture and submodules (profile will skip tests)
run: >
mvn -B -f modules/dataverse-parent
-P ct -pl edu.harvard.iq:dataverse -am
install
- name: Deploy multi-arch application container image
run: mvn -Pct deploy -Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }} ${{ env.REGISTRY }} -Ddocker.platforms=${{ env.PLATFORMS }}
run: >
mvn
-Dapp.image.tag=${{ env.IMAGE_TAG }} -Dbase.image.tag=${{ env.BASE_IMAGE_TAG }}
${{ env.REGISTRY }} -Ddocker.platforms=${{ env.PLATFORMS }}
-P ct deploy
- uses: marocchino/sticky-pull-request-comment@v2
if: ${{ github.event_name == 'pull_request' }}
38 changes: 27 additions & 11 deletions .github/workflows/maven_unit_test.yml
@@ -6,11 +6,15 @@ on:
- "**.java"
- "pom.xml"
- "modules/**/pom.xml"
- "!modules/container-base/**"
- "!modules/dataverse-spi/**"
pull_request:
paths:
- "**.java"
- "pom.xml"
- "modules/**/pom.xml"
- "!modules/container-base/**"
- "!modules/dataverse-spi/**"

jobs:
unittest:
@@ -33,25 +37,37 @@ jobs:
continue-on-error: ${{ matrix.experimental }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up JDK ${{ matrix.jdk }}
uses: actions/setup-java@v2
uses: actions/setup-java@v3
with:
java-version: ${{ matrix.jdk }}
distribution: 'adopt'
- name: Cache Maven packages
uses: actions/cache@v2
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
distribution: temurin
cache: maven

# The reason why we use "install" here is that we want the submodules to be available in the next step.
# Also, we can cache them this way for jobs triggered by this one.
- name: Build with Maven
run: mvn -DcompilerArgument=-Xlint:unchecked -Dtarget.java.version=${{ matrix.jdk }} -P all-unit-tests clean test
run: >
mvn -B -f modules/dataverse-parent
-Dtarget.java.version=${{ matrix.jdk }}
-DcompilerArgument=-Xlint:unchecked -P all-unit-tests
-pl edu.harvard.iq:dataverse -am
install
- name: Maven Code Coverage
env:
CI_NAME: github
COVERALLS_SECRET: ${{ secrets.GITHUB_TOKEN }}
run: mvn -V -B jacoco:report coveralls:report -DrepoToken=${COVERALLS_SECRET} -DpullRequest=${{ github.event.number }}
# The coverage commit is sometimes flaky. Don't bail out just because this optional step failed.
continue-on-error: true
run: >
mvn -B
-DrepoToken=${COVERALLS_SECRET} -DpullRequest=${{ github.event.number }}
jacoco:report coveralls:report
# We don't want to cache the WAR file, so delete it
- run: rm -rf ~/.m2/repository/edu/harvard/iq/dataverse
push-app-img:
name: Publish App Image
permissions:
94 changes: 94 additions & 0 deletions .github/workflows/spi_release.yml
@@ -0,0 +1,94 @@
name: Dataverse SPI

on:
push:
branch:
- "develop"
paths:
- "modules/dataverse-spi/**"
pull_request:
branch:
- "develop"
paths:
- "modules/dataverse-spi/**"

jobs:
# Note: Pushing packages to Maven Central requires access to secrets, which pull requests from remote forks
# don't have. Skip in these cases.
check-secrets:
name: Check for Secrets Availability
runs-on: ubuntu-latest
outputs:
available: ${{ steps.secret-check.outputs.available }}
steps:
- id: secret-check
# perform secret check & put boolean result as an output
shell: bash
run: |
if [ "${{ secrets.DATAVERSEBOT_SONATYPE_USERNAME }}" != '' ]; then
echo "available=true" >> $GITHUB_OUTPUT;
else
echo "available=false" >> $GITHUB_OUTPUT;
fi
snapshot:
name: Release Snapshot
needs: check-secrets
runs-on: ubuntu-latest
if: github.event_name == 'pull_request' && needs.check-secrets.outputs.available == 'true'
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
with:
java-version: '11'
distribution: 'adopt'
server-id: ossrh
server-username: MAVEN_USERNAME
server-password: MAVEN_PASSWORD
- uses: actions/cache@v2
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2

- name: Deploy Snapshot
run: mvn -f modules/dataverse-spi -Dproject.version.suffix="-PR${{ github.event.number }}-SNAPSHOT" deploy
env:
MAVEN_USERNAME: ${{ secrets.DATAVERSEBOT_SONATYPE_USERNAME }}
MAVEN_PASSWORD: ${{ secrets.DATAVERSEBOT_SONATYPE_TOKEN }}

release:
name: Release
needs: check-secrets
runs-on: ubuntu-latest
if: github.event_name == 'push' && needs.check-secrets.outputs.available == 'true'
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
with:
java-version: '11'
distribution: 'adopt'
- uses: actions/cache@v2
with:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2

# Running setup-java again overwrites the settings.xml - IT'S MANDATORY TO DO THIS SECOND SETUP!!!
- name: Set up Maven Central Repository
uses: actions/setup-java@v3
with:
java-version: '11'
distribution: 'adopt'
server-id: ossrh
server-username: MAVEN_USERNAME
server-password: MAVEN_PASSWORD
gpg-private-key: ${{ secrets.DATAVERSEBOT_GPG_KEY }}
gpg-passphrase: MAVEN_GPG_PASSPHRASE

- name: Sign + Publish Release
run: mvn -f modules/dataverse-spi -P release deploy
env:
MAVEN_USERNAME: ${{ secrets.DATAVERSEBOT_SONATYPE_USERNAME }}
MAVEN_PASSWORD: ${{ secrets.DATAVERSEBOT_SONATYPE_TOKEN }}
MAVEN_GPG_PASSPHRASE: ${{ secrets.DATAVERSEBOT_GPG_PASSWORD }}
11 changes: 11 additions & 0 deletions doc/release-notes/9175-external-exporters.md
@@ -0,0 +1,11 @@
## Ability to Create New Exporters

It is now possible for third parties to develop and share code to provide new metadata export formats for Dataverse. Export formats can be made available via the Dataverse UI and API or configured for use in Harvesting. Dataverse now provides developers with a separate dataverse-spi JAR file that contains the Java interfaces and classes required to create a new metadata Exporter. Once a new Exporter has been created and packaged as a JAR file, administrators can use it by specifying a local directory for third-party Exporters, dropping the Exporter JAR there, and restarting Payara. This mechanism also allows new Exporters to replace any of Dataverse's existing metadata export formats.

## Backward Incompatibilities

Care should be taken when replacing Dataverse's internal metadata export formats, as third-party code, including other third-party Exporters, may depend on the contents of those export formats. When replacing an existing format, one must also remember to delete the cached metadata export files or run the reExport command so that the metadata exports of existing datasets are updated.

## New JVM/MicroProfile Settings

dataverse.spi.exporters.directory - specifies a directory, readable by the Dataverse server, in which to look for third-party Exporters. Any Exporter JAR files placed in this directory will be read by Dataverse and used to add or replace the metadata formats they provide.
3 changes: 2 additions & 1 deletion doc/sphinx-guides/source/developers/index.rst
@@ -4,7 +4,7 @@
contain the root `toctree` directive.
Developer Guide
=======================================================
===============

**Contents:**

@@ -27,6 +27,7 @@ Developer Guide
deployment
containers
making-releases
metadataexport
tools
unf/index
make-data-count
88 changes: 88 additions & 0 deletions doc/sphinx-guides/source/developers/metadataexport.rst
@@ -0,0 +1,88 @@
=======================
Metadata Export Formats
=======================

.. contents:: |toctitle|
:local:

Introduction
------------

Dataverse ships with a number of metadata export formats available for published datasets. A given metadata export
format may be available for user download (via the UI and API) and/or be available for use in Harvesting between
Dataverse instances.

As of v5.14, Dataverse provides a mechanism for third-party developers to create new metadata Exporters that implement
new metadata formats or that replace existing formats. All the necessary dependencies are packaged in an interface JAR file
available from Maven Central. Developers can distribute their new Exporters as JAR files which can be dynamically loaded
into Dataverse instances - see :ref:`external-exporters`. Developers are encouraged to make their Exporter code available
via https://github.com/gdcc/dataverse-exporters (or minimally, to list their existence in the README there).

Exporter Basics
---------------

New Exporters must implement the ``io.gdcc.spi.export.Exporter`` interface. The interface includes a few methods for the Exporter
to provide Dataverse with the format it produces, a display name, format mimetype, and whether the format is for download
and/or harvesting use, etc. It also includes a main ``exportDataset(ExportDataProvider dataProvider, OutputStream outputStream)``
method through which the Exporter receives metadata about the given dataset (via the ``ExportDataProvider``, described further
below) and writes its output (to the provided OutputStream).

Exporters that create an XML format must implement the ``io.gdcc.spi.export.XMLExporter`` interface (which extends the Exporter
interface). XMLExporter adds a few methods through which the XMLExporter provides information to Dataverse about the XML
namespace and version being used.

Exporters also need to use the ``@AutoService(Exporter.class)`` annotation, which makes the class discoverable as an Exporter implementation.
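
Putting these pieces together, a minimal Exporter skeleton might look like the sketch below. Apart from ``exportDataset``,
whose signature is quoted above, the method names and return types used here are illustrative assumptions; consult the
dataverse-spi javadoc and the example repository for the authoritative interface.

.. code-block:: java

    import java.io.OutputStream;
    import java.util.Locale;

    import com.google.auto.service.AutoService;
    import io.gdcc.spi.export.ExportDataProvider;
    import io.gdcc.spi.export.ExportException;
    import io.gdcc.spi.export.Exporter;

    // @AutoService registers this class so that Dataverse can discover it when the JAR is loaded.
    @AutoService(Exporter.class)
    public class MyFormatExporter implements Exporter {

        // Internal name of the metadata format this Exporter produces (accessor name assumed).
        @Override
        public String getFormatName() {
            return "my_format";
        }

        // Label shown in the dataset Metadata Export menu (accessor name assumed).
        @Override
        public String getDisplayName(Locale locale) {
            return "My Format";
        }

        // Whether the format is offered for harvesting and/or user download (accessor names assumed).
        @Override
        public Boolean isHarvestable() {
            return false;
        }

        @Override
        public Boolean isAvailableToUsers() {
            return true;
        }

        // Mimetype of the generated output (accessor name assumed).
        @Override
        public String getMediaType() {
            return "application/json";
        }

        @Override
        public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException {
            // Read metadata from dataProvider and write the new format to outputStream (see below).
        }
    }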

The ``ExportDataProvider`` interface provides several methods through which your Exporter can receive dataset and file metadata
in various formats. Your exporter would parse the information in one or more of these inputs to retrieve the values needed to
generate the Exporter's output format.

The most important methods/input formats are:

- ``getDatasetJson()`` - metadata in the internal Dataverse JSON format used in the native API and available via the built-in JSON metadata export.
- ``getDatasetORE()`` - metadata in the OAI_ORE format available as a built-in metadata format and as used in Dataverse's BagIT-based Archiving capability.
- ``getDatasetFileDetails()`` - detailed file-level metadata for ingested tabular files.

The first two of these provide near-complete metadata about the dataset along with the metadata common to all files. This includes all metadata
entries from all metadata blocks, PIDs, tags, licenses and custom terms, etc. Almost all built-in exporters today use the JSON input.
The newer OAI_ORE export, which is JSON-LD-based, provides a flatter structure and references metadata terms by their external vocabulary ids
(e.g. http://purl.org/dc/terms/title), which may make it a preferable starting point in some cases.

The last method above provides a new JSON-formatted serialization of the variable-level file metadata Dataverse generates during ingest of tabular files.
This information has only been included in the built-in DDI export, as the content of a ``dataDscr`` element. (Hence inspecting the edu.harvard.iq.dataverse.export.DDIExporter and related classes would be a good way to explore how the JSON is structured.)

The interface also provides:

- ``getDatasetSchemaDotOrg()`` and
- ``getDataCiteXml()``.

These provide subsets of metadata in the indicated formats. They may be useful starting points if your exporter will, for example, only add one or two additional fields to the given format.

If an Exporter cannot create a requested metadata format for some reason, it should throw an ``io.gdcc.spi.export.ExportException``.
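
As an illustration only, an ``exportDataset`` implementation built on the JSON input might look like the method below,
which would sit inside an Exporter class such as the skeleton shown earlier (it additionally needs ``java.io.IOException``,
``java.nio.charset.StandardCharsets``, and ``jakarta.json.JsonObject`` imports). The assumption that ``getDatasetJson()``
returns a ``jakarta.json.JsonObject``, the JSON path accessed, and the ``ExportException`` message constructor are all
illustrative; check the dataverse-spi javadoc for the exact types.

.. code-block:: java

    @Override
    public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException {
        try {
            // Assumed to return the dataset's native-API JSON as a jakarta.json.JsonObject.
            JsonObject datasetJson = dataProvider.getDatasetJson();

            // Pull out whichever values the target format needs; this particular path is purely illustrative.
            JsonObject version = datasetJson.getJsonObject("datasetVersion");

            // Render this Exporter's format and write it to the stream Dataverse provides.
            String output = "My format rendering of: " + version.toString();
            outputStream.write(output.getBytes(StandardCharsets.UTF_8));
            outputStream.flush();
        } catch (IOException | RuntimeException e) {
            // Tell Dataverse that this format could not be produced for the dataset.
            throw new ExportException("MyFormatExporter failed: " + e.getMessage());
        }
    }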

Building an Exporter
--------------------

The example at https://github.com/gdcc/dataverse-exporters provides a Maven pom.xml file suitable for building an Exporter JAR file, and that repository provides additional development guidance.

There are four dependencies needed to build an Exporter:

- the ``io.gdcc:dataverse-spi`` library, which contains the interfaces discussed above and the ``ExportException`` class
- ``com.google.auto.service:auto-service``, which provides the ``@AutoService`` annotation
- ``jakarta.json:jakarta.json-api`` for JSON classes
- ``jakarta.ws.rs:jakarta.ws.rs-api``, which provides a ``MediaType`` enumeration for specifying MIME types.

Specifying a Prerequisite Export
--------------------------------

An advanced feature of the Exporter mechanism allows a new Exporter to specify that it requires, as input,
the output of another Exporter. An example of this is the built-in HTMLExporter, which requires the output
of the DDI XML Exporter to produce an HTML document with the same DDI content.

This is configured by providing the metadata format name via the ``Exporter.getPrerequisiteFormatName()`` method.
When this method returns a non-empty format name, Dataverse will provide the requested format to the Exporter via
the ``ExportDataProvider.getPrerequisiteInputStream()`` method.
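
A sketch of these two hooks, again as methods inside an Exporter class like the earlier skeleton (with ``java.util.Optional``
and ``java.io.InputStream`` imports added), is shown below. The ``Optional`` return types and the ``"ddi"`` format name are
assumptions made for illustration; the exact signatures are defined in the dataverse-spi library.

.. code-block:: java

    // Ask Dataverse for the output of the Exporter that produces the DDI format
    // (return type and format name assumed for illustration).
    @Override
    public Optional<String> getPrerequisiteFormatName() {
        return Optional.of("ddi");
    }

    @Override
    public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) throws ExportException {
        // Assumed to supply the prerequisite format's output when Dataverse could produce it.
        InputStream prerequisite = dataProvider.getPrerequisiteInputStream()
                .orElseThrow(() -> new ExportException("Prerequisite DDI export was not available"));
        // ... transform the prerequisite content and write this Exporter's format to outputStream ...
    }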

Developers and administrators deploying Exporters using this mechanism should be aware that, since metadata formats
can be changed by other Exporters, the InputStream received may not hold the expected metadata. Developers should clearly
document their compatibility with the built-in or third-party Exporters they support as prerequisites.
26 changes: 26 additions & 0 deletions doc/sphinx-guides/source/installation/advanced.rst
@@ -115,3 +115,29 @@ To activate in your Dataverse installation::

curl -X PUT -d '/cgi-bin/zipdownload' http://localhost:8080/api/admin/settings/:CustomZipDownloadServiceUrl

.. _external-exporters:

Installing External Metadata Exporters
++++++++++++++++++++++++++++++++++++++

As of Dataverse Software 5.14, Dataverse supports the use of external Exporters as a way to add additional metadata
export formats to Dataverse or to replace the built-in formats. This should be considered an **experimental** capability,
in that the mechanism is expected to evolve and using it may require additional effort when upgrading to new Dataverse
versions.

This capability is enabled by specifying a directory in which Dataverse should look for third-party Exporters. See
:ref:`dataverse.spi.exporters.directory`.

See :doc:`/developers/metadataexport` for details about how to develop new Exporters.

A minimal example Exporter is available at https://github.com/gdcc/dataverse-exporters. The community is encouraged to
add additional exporters (and/or links to exporters elsewhere) in this repository. Once you have downloaded the
dataverse-spi-export-examples-1.0.0.jar (or another exporter JAR), installed it in the directory specified above, and
restarted your Payara server, the new exporter should be available.

The example dataverse-spi-export-examples-1.0.0.jar replaces the ``JSON`` export with a ``MyJSON in <locale>`` version
that just wraps the existing JSON export object in a new JSON object with the key ``inputJson`` containing the original
JSON. (Note that the ``MyJSON in <locale>`` label will appear in the dataset Metadata Export download menu immediately,
but the content for already published datasets will only be updated after you delete the cached exports and/or use a
reExport API call (see :ref:`batch-exports-through-the-api`).)
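
Conceptually (this is a sketch, not the actual example code, and the helper name is hypothetical), the wrapping performed
by the example exporter amounts to something like:

.. code-block:: java

    import java.io.OutputStream;

    import jakarta.json.Json;
    import jakarta.json.JsonObject;
    import jakarta.json.JsonWriter;

    // Wrap the JSON produced by the built-in JSON export under a single "inputJson" key.
    static void writeWrappedJson(JsonObject originalJsonExport, OutputStream outputStream) {
        JsonObject wrapped = Json.createObjectBuilder()
                .add("inputJson", originalJsonExport)
                .build();
        try (JsonWriter writer = Json.createWriter(outputStream)) {
            writer.write(wrapped);
        }
    }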
