Merge pull request #7579 from IQSS/7084-crawlable-file-access
7084 crawlable file access
kcondon authored Feb 17, 2021
2 parents a82b7c3 + 78ca6d5 commit a7a54ae
Show file tree
Hide file tree
Showing 14 changed files with 566 additions and 3 deletions.
29 changes: 29 additions & 0 deletions doc/release-notes/7084-crawlable-file-access.md
@@ -0,0 +1,29 @@
## Release Highlights

### A new file access API

A new API offers a *crawlable* view of the folders and files within a dataset:

```
/api/datasets/<dataset id>/dirindex/
```

will output a simple HTML listing, based on the standard Apache
directory index, with Access API download links for individual files
and recursive calls to the API above for sub-folders. (See the
documentation entry in the guides for more information.)
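
For example (a sketch; the hostname and dataset id below are placeholders for your own installation and dataset):

```
curl https://demo.dataverse.org/api/datasets/24/dirindex/
```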

Using this API, ``wget --recursive`` (or a similar crawling client) can
be used to download all the files in a dataset, preserving the file
names and folder structure, without having to use the download-as-zip
API. In addition to being faster (zipping is a relatively
resource-intensive operation on the server side), this process can be
restarted if interrupted (with ``wget --continue`` or equivalent),
unlike zipped multi-file downloads, which always have to start from the
beginning.
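
For example, using the same crawl command documented in the guides (hostname and dataset id are again placeholders):

```
wget -r -e robots=off -nH --cut-dirs=3 --content-disposition https://demo.dataverse.org/api/datasets/24/dirindex/
```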

On a system that uses S3 with download redirects, the individual file
downloads will be handled by S3 directly, without having to be proxied
through the Dataverse application.


[Four image files in this commit could not be displayed in the diff view; most likely these are the screenshots referenced in the guide changes below: dataset_page_files_view.png, dataset_page_tree_view.png, index_view_top.png, and index_view_subfolder.png.]
85 changes: 84 additions & 1 deletion doc/sphinx-guides/source/api/native-api.rst
@@ -783,9 +783,92 @@ List Files in a Dataset
The fully expanded example above (without environment variables) looks like this:

.. code-block:: bash

  curl https://demo.dataverse.org/api/datasets/24/versions/1.0/files

View Dataset Files and Folders as a Directory Index
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

|CORS| Provides a *crawlable* view of files and folders within the given dataset and version:

.. code-block:: bash

  curl $SERVER_URL/api/datasets/$ID/dirindex/
Optional parameters (see the example below):

* ``folder`` - A subfolder within the dataset (default: top-level view of the dataset)
* ``version`` - Specifies the version (default: latest published version)
* ``original=true`` - Download original versions of ingested tabular files.
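
For example, to list a sub-folder of a specific version, requesting original versions of ingested tabular files (the server, dataset id, version, and folder below are placeholder values):

.. code-block:: bash

  curl "https://demo.dataverse.org/api/datasets/24/dirindex/?version=1.0&folder=subfolder&original=true"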

This API outputs a simple HTML listing, based on the standard Apache
directory index, with Access API download links for individual files,
and recursive calls to the API above for sub-folders.

Using this API, ``wget --recursive`` (or a similar crawling client) can
be used to download all the files in a dataset, preserving the file
names and folder structure, without having to use the download-as-zip
API. In addition to being faster (zipping is a relatively
resource-intensive operation on the server side), this process can be
restarted if interrupted (with ``wget --continue`` or equivalent),
unlike zipped multi-file downloads, which always have to start from the
beginning.

On a system that uses S3 with download redirects, the individual file
downloads will be handled by S3 directly, without having to be proxied
through the Dataverse application.

For example, take a dataset version with two files, one of them in a folder named "subfolder":

.. |image1| image:: ./img/dataset_page_files_view.png

or, as viewed as a tree on the dataset page:

.. |image2| image:: ./img/dataset_page_tree_view.png

The output of the API for the top-level folder (``/api/datasets/{dataset}/dirindex/``) will be as follows:

.. |image3| image:: ./img/index_view_top.png

with the underlying HTML source:
.. code-block:: html

  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
  <html><head><title>Index of folder /</title></head>
  <body><h1>Index of folder / in dataset doi:XXX/YY/ZZZZ (v. MM)</h1>
  <table>
  <tr><th>Name</th><th>Last Modified</th><th>Size</th><th>Description</th></tr>
  <tr><th colspan="4"><hr></th></tr>
  <tr><td><a href="/api/datasets/NNNN/dirindex/?folder=subfolder">subfolder/</a></td><td align="right"> - </td><td align="right"> - </td><td align="right">&nbsp;</td></tr>
  <tr><td><a href="/api/access/datafile/KKKK">testfile.txt</a></td><td align="right">13-January-2021 22:35</td><td align="right">19 B</td><td align="right">&nbsp;</td></tr>
  </table></body></html>

The ``/dirindex/?folder=subfolder`` link above will produce the following view:

.. |image4| image:: ./img/index_view_subfolder.png

with the HTML source as follows:
.. code-block:: html

  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
  <html><head><title>Index of folder /subfolder</title></head>
  <body><h1>Index of folder /subfolder in dataset doi:XXX/YY/ZZZZ (v. MM)</h1>
  <table>
  <tr><th>Name</th><th>Last Modified</th><th>Size</th><th>Description</th></tr>
  <tr><th colspan="4"><hr></th></tr>
  <tr><td><a href="/api/access/datafile/subfolder/LLLL">50by1000.tab</a></td><td align="right">11-January-2021 09:31</td><td align="right">102.5 KB</td><td align="right">&nbsp;</td></tr>
  </table></body></html>

An example of a ``wget`` command line for crawling ("recursive downloading") the files and folders in a dataset:

.. code-block:: bash

  wget -r -e robots=off -nH --cut-dirs=3 --content-disposition https://demo.dataverse.org/api/datasets/24/dirindex/

.. note:: In addition to the files and folders in the dataset, the command line above will also save the directory index of each folder, in a separate folder "dirindex".
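
If the saved index pages are not needed after the crawl, they can be deleted; a sketch, assuming the ``.index*.html`` file naming used by the API above (adjust the path to wherever the crawl saved them):

.. code-block:: bash

  find . -name '.index*.html' -delete
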
List All Metadata Blocks for a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 changes: 27 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java
@@ -214,6 +214,33 @@ public List<FileMetadata> getFileMetadatasSortedByLabelAndFolder() {
    Collections.sort(fileMetadatasCopy, FileMetadata.compareByLabelAndFolder);
    return fileMetadatasCopy;
}

// Returns the FileMetadatas of the files located directly in the folder
// folderName, plus one entry for each of its immediate sub-folders
// (represented by the first file found in that sub-folder), sorted by
// the full path of the file:
public List<FileMetadata> getFileMetadatasFolderListing(String folderName) {
    ArrayList<FileMetadata> fileMetadatasCopy = new ArrayList<>();
    HashSet<String> subFolders = new HashSet<>();

    for (FileMetadata fileMetadata : fileMetadatas) {
        String thisFolder = fileMetadata.getDirectoryLabel() == null ? "" : fileMetadata.getDirectoryLabel();

        if (folderName.equals(thisFolder)) {
            // The file lives directly in the requested folder:
            fileMetadatasCopy.add(fileMetadata);
        } else if (thisFolder.startsWith(folderName)) {
            // The file lives somewhere below the requested folder; extract
            // the name of the immediate sub-folder on the path to it:
            String subFolder = "".equals(folderName) ? thisFolder : thisFolder.substring(folderName.length() + 1);
            if (subFolder.indexOf('/') > 0) {
                subFolder = subFolder.substring(0, subFolder.indexOf('/'));
            }

            // List each sub-folder once only:
            if (!subFolders.contains(subFolder)) {
                fileMetadatasCopy.add(fileMetadata);
                subFolders.add(subFolder);
            }

        }
    }
    Collections.sort(fileMetadatasCopy, FileMetadata.compareByFullPath);

    return fileMetadatasCopy;
}

public void setFileMetadatas(List<FileMetadata> fileMetadatas) {
    this.fileMetadatas = fileMetadatas;
10 changes: 10 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/FileMetadata.java
@@ -575,6 +575,16 @@ public int compare(FileMetadata o1, FileMetadata o2) {
    }
};

// Compares two FileMetadatas by the full, case-insensitive path of the
// file, i.e. the folder name (if any) followed by the file label:
public static final Comparator<FileMetadata> compareByFullPath = new Comparator<FileMetadata>() {
    @Override
    public int compare(FileMetadata o1, FileMetadata o2) {
        String folder1 = StringUtil.isEmpty(o1.getDirectoryLabel()) ? "" : o1.getDirectoryLabel().toUpperCase() + "/";
        String folder2 = StringUtil.isEmpty(o2.getDirectoryLabel()) ? "" : o2.getDirectoryLabel().toUpperCase() + "/";

        return folder1.concat(o1.getLabel().toUpperCase()).compareTo(folder2.concat(o2.getLabel().toUpperCase()));
    }
};


public String toPrettyJSON(){

14 changes: 12 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/api/Access.java
@@ -270,11 +270,21 @@ private DataFile findDataFileOrDieWrapper(String fileId){
}


@Path("datafile/{fileId}")
@Path("datafile/{fileId:.+}")
@GET
@Produces({"application/xml"})
public DownloadInstance datafile(@PathParam("fileId") String fileId, @QueryParam("gbrecs") boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ {


    if (fileId.indexOf('/') > -1) {
        // This is for embedding folder names into the Access API URLs;
        // something like /api/access/datafile/folder/subfolder/1234
        // instead of the normal /api/access/datafile/1234 notation.
        // This is supported only for recreating folders during recursive
        // downloads, i.e. the folders are embedded into the URL for the
        // benefit of a remote client like wget, but can be safely ignored
        // here; only the trailing file id matters:
        fileId = fileId.substring(fileId.lastIndexOf('/') + 1);
    }

    DataFile df = findDataFileOrDieWrapper(fileId);
    GuestbookResponse gbr = null;

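With the relaxed ``{fileId:.+}`` path pattern above, folder-embedded download URLs resolve to the same file as the plain notation; for instance (a sketch based on the paths in the code comment; the hostname and ids are placeholders):

```
curl https://demo.dataverse.org/api/access/datafile/1234
curl https://demo.dataverse.org/api/access/datafile/folder/subfolder/1234
```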
37 changes: 37 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -156,6 +156,7 @@
import org.glassfish.jersey.media.multipart.FormDataParam;

import com.amazonaws.services.s3.model.PartETag;
import edu.harvard.iq.dataverse.FileMetadata;
import java.util.Map.Entry;

@Path("datasets")
@@ -466,6 +467,42 @@ public Response getVersionFiles( @PathParam("id") String datasetId, @PathParam(
getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers).getFileMetadatas())));
}

@GET
@Path("{id}/dirindex")
@Produces("text/html")
public Response getFileAccessFolderView(@PathParam("id") String datasetId, @QueryParam("version") String versionId, @QueryParam("folder") String folderName, @QueryParam("original") Boolean originals, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) {

    // Default to the top-level folder and the latest published version:
    folderName = folderName == null ? "" : folderName;
    versionId = versionId == null ? ":latest-published" : versionId;

    DatasetVersion version;
    try {
        DataverseRequest req = createDataverseRequest(findUserOrDie());
        version = getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers);
    } catch (WrappedResponse wr) {
        return wr.getResponse();
    }

    String output = FileUtil.formatFolderListingHtml(folderName, version, "", originals != null && originals);

    // return "NOT FOUND" if there is no such folder in the dataset version:
    if ("".equals(output)) {
        return notFound("Folder " + folderName + " does not exist");
    }

    // Suggest a filename (".index.html", or ".index-<folder>.html" for a
    // sub-folder) for crawling clients, like wget, that save the listing:
    String indexFileName = folderName.equals("") ? ".index.html"
            : ".index-" + folderName.replace('/', '_') + ".html";
    response.setHeader("Content-disposition", "attachment; filename=\"" + indexFileName + "\"");

    return Response.ok()
            .entity(output)
            //.type("application/html").
            .build();
}

@GET
@Path("{id}/versions/{versionId}/metadata")
public Response getVersionMetadata( @PathParam("id") String datasetId, @PathParam("versionId") String versionId, @Context UriInfo uriInfo, @Context HttpHeaders headers) {
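For a quick manual test of the new endpoint, ``curl -OJ`` will honor the ``Content-disposition`` filename set above and save the listing as ``.index.html`` (a sketch; hostname and dataset id are placeholders):

```
curl -OJ https://demo.dataverse.org/api/datasets/24/dirindex/
```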
[The remaining changed files in this commit were not loaded in this view.]