Commit

feat: support for pdf download (#109)
kptdobe authored Feb 6, 2023
1 parent 4dc3722 commit 2e633f8
Showing 6 changed files with 104 additions and 15 deletions.
56 changes: 56 additions & 0 deletions docs/download-pdf.md
@@ -0,0 +1,56 @@
# Case study: convert a page and download the PDFs referenced on the page

Page to consider: https://main--hlxsite--kptdobe.hlx.page/content/page-with-pdf

This is a dummy page; do not worry about the look and feel or whether the content is meaningful. The goal is to convert that page to docx AND to tell the importer to download the referenced PDFs.

Here is an import.js example:

```js
export default {
  transform: ({
    // eslint-disable-next-line no-unused-vars
    document,
    url,
  }) => {
    const main = document.body;
    const results = [];

    // main page import - "element" is provided, i.e. a docx will be created
    results.push({
      element: main,
      path: new URL(url).pathname,
    });

    // find pdf links
    main.querySelectorAll('a').forEach((a) => {
      const href = a.getAttribute('href');
      if (href && href.endsWith('.pdf')) {
        const u = new URL(href, url);
        const newPath = WebImporter.FileUtils.sanitizePath(u.pathname);
        // no "element", the "from" property is provided instead - the importer
        // will download the "from" resource as "path"
        results.push({
          path: newPath,
          from: u.toString(),
        });

        // update the link to the new path on the target host
        // this is required to be able to follow the links in Word
        // you will need to replace "main--repo--owner" with your project setup
        const newHref = new URL(newPath, 'https://main--repo--owner.hlx.page').toString();
        a.setAttribute('href', newHref);
      }
    });

    return results;
  },
};
```

Using the `transform` method and multiple outputs, you can return both the page to be transformed and each PDF to be downloaded. When running the import on the URL above, you get the following files:

![image](https://user-images.githubusercontent.com/474200/216992850-1ae0304f-b364-45c0-888b-685f8c1ebc19.png)

Notes:
- if you do not need the page as a docx, you can remove the first `results.push` call - only the PDFs will be downloaded
- this approach would potentially work with other types of resources
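For reference, the mapping from a link found on the page to a download entry relies only on the standard `URL` API. In this sketch the page URL and href are hypothetical, and `sanitizePath` is a stand-in for `WebImporter.FileUtils.sanitizePath`, which is only available inside the importer:

```javascript
// Minimal sketch: how a PDF href found on the page becomes a download entry.
// sanitizePath is a stand-in for WebImporter.FileUtils.sanitizePath.
const sanitizePath = (p) => p;

const pageUrl = 'https://main--hlxsite--kptdobe.hlx.page/content/page-with-pdf';
const href = '/assets/brochure.pdf'; // hypothetical relative link found in the page

// resolve the (possibly relative) href against the page URL
const u = new URL(href, pageUrl);
const entry = {
  path: sanitizePath(u.pathname), // where the importer saves the file
  from: u.toString(), // absolute URL the importer downloads
};
// entry.path -> '/assets/brochure.pdf'
// entry.from -> 'https://main--hlxsite--kptdobe.hlx.page/assets/brochure.pdf'
```

Because the href is resolved against the page URL, both relative and absolute PDF links end up with a fully qualified `from` URL.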
2 changes: 1 addition & 1 deletion import-bulk.html
@@ -47,7 +47,7 @@ <h2>Import - Bulk</h2>
<sp-number-field class="option-field" id="import-pageload-timeout" value="100" min="0" step="100" format-options='{ "style": "unit", "unit": "millisecond", "unitDisplay": "short" }'></sp-number-field>

<sp-checkbox class="option-field" id="import-local-save" checked>
Save locally as .docx
Save files locally
</sp-checkbox>

<sp-checkbox class="option-field" id="import-enable-js" checked>
2 changes: 1 addition & 1 deletion import.html
@@ -54,7 +54,7 @@ <h2>Import - Workbench</h2>
<sp-number-field class="option-field" id="import-pageload-timeout" value="100" min="0" step="100" format-options='{ "style": "unit", "unit": "millisecond", "unitDisplay": "short" }'></sp-number-field>

<sp-checkbox class="option-field" id="import-local-save" checked>
Save locally as .docx
Save files locally
</sp-checkbox>

<sp-checkbox class="option-field" id="import-enable-js" checked>
6 changes: 6 additions & 0 deletions importer-guidelines.md
@@ -503,6 +503,12 @@ With this method, you can construct an `xlsx` spreadsheet with the site data you

Sites in the https://github.com/hlxsites/ organization have all been imported. There are many different implementations that cover a lot of use cases.

## Case studies

Here is a growing list of case studies to help you drive more sophisticated imports.

1. [Convert a page and download all PDF files referenced on the page](./docs/download-pdf.md)

## Helpers

The `DOMUtils` and `Blocks` objects are exposed. Their implementation can be found here:
51 changes: 39 additions & 12 deletions js/import/import.ui.js
@@ -41,7 +41,6 @@ const BULK_URLS_LIST = document.querySelector('#import-result ul');

const IMPORT_FILE_PICKER_CONTAINER = document.getElementById('import-file-picker-container');

const DOWNLOAD_BINARY_TYPES = ['pdf'];
const REPORT_FILENAME = 'import-report.xlsx';

const ui = {};
@@ -196,18 +195,46 @@ const getProxyURLSetup = (url, origin) => {

const postSuccessfulStep = async (results, originalURL) => {
await asyncForEach(results, async ({
docx, filename, path, report,
docx, filename, path, report, from,
}) => {
const data = {
status: 'Success',
url: originalURL,
path,
};

const includeDocx = !!docx;
if (includeDocx) {
await saveFile(dirHandle, filename, docx);
data.docx = filename;
if (docx) {
if (dirHandle) {
await saveFile(dirHandle, filename, docx);
data.file = filename;
data.status = 'Success';
} else {
data.status = 'Success - No file created';
}
} else if (from) {
try {
const res = await fetch(from);
if (res && res.ok) {
if (res.redirected) {
data.status = 'Redirect';
data.redirect = res.url;
} else if (dirHandle) {
const blob = await res.blob();
await saveFile(dirHandle, path, blob);
data.file = path;
data.status = 'Success';
} else {
data.status = 'Success - No file created';
}
} else {
data.status = `Error: Failed to download ${from} - ${res.status} ${res.statusText}`;
}
} catch (e) {
// eslint-disable-next-line no-console
console.error(`Failed to download ${from} to ${path}`, e);
data.status = `Error: Failed to download ${from} - ${e.message}`;
}
} else {
data.status = 'Success - No file created';
}

if (report) {
@@ -229,7 +256,7 @@ const getReport = async () => {
const workbook = new ExcelJS.Workbook();
const worksheet = workbook.addWorksheet('Sheet 1');

const headers = ['URL', 'path', 'docx', 'status', 'redirect'].concat(importStatus.extraCols);
const headers = ['URL', 'path', 'file', 'status', 'redirect'].concat(importStatus.extraCols);

// create Excel auto Filters for the first row / header
worksheet.autoFilter = {
@@ -241,7 +268,7 @@
headers,
].concat(importStatus.rows.map((row) => {
const {
url, path, docx, status, redirect, report,
url, path, file, status, redirect, report,
} = row;
const extra = [];
if (report) {
@@ -263,7 +290,7 @@
}
});
}
return [url, path, docx || '', status, redirect || ''].concat(extra);
return [url, path, file || '', status, redirect || ''].concat(extra);
})));

return workbook.xlsx.writeBuffer();
@@ -385,6 +412,7 @@ const attachListeners = () => {
try {
res = await fetch(src);
} catch (e) {
// eslint-disable-next-line no-console
console.error(`Unexpected error when trying to fetch ${src} - CORS issue ?`, e);
}
if (res && res.ok) {
@@ -455,8 +483,7 @@ const attachListeners = () => {
current.removeEventListener('transformation-complete', processNext);

current.replaceWith(frame);
} else if (IS_BULK
&& DOWNLOAD_BINARY_TYPES.filter((t) => contentType.includes(t)).length > 0) {
} else if (dirHandle) {
const blob = await res.blob();
const u = new URL(src);
const path = WebImporter.FileUtils.sanitizePath(u.pathname);
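The branching added to `postSuccessfulStep` (redirect detection first, then local save only when a directory handle is available) can be distilled into a small pure function. This is a hypothetical helper for illustration, not part of the actual change, and it omits the `from` URL from the error message for brevity:

```javascript
// Hypothetical helper mirroring the status logic added to postSuccessfulStep:
// a redirect is reported before any save, and a file is only written when a
// directory handle (local save enabled) is available.
function downloadStatus(res, hasDirHandle) {
  if (!res || !res.ok) {
    const detail = res ? `${res.status} ${res.statusText}` : 'no response';
    return `Error: Failed to download - ${detail}`;
  }
  if (res.redirected) return 'Redirect';
  return hasDirHandle ? 'Success' : 'Success - No file created';
}
```

This makes the precedence explicit: a failed fetch always wins, then a redirect, and only then does the presence of a directory handle decide between saving the blob and reporting that no file was created.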
2 changes: 1 addition & 1 deletion package.json
@@ -14,7 +14,7 @@
"semantic-release": "semantic-release"
},
"dependencies": {
"@adobe/helix-importer": "2.6.0",
"@adobe/helix-importer": "2.7.0",
"@adobe/mdast-util-gridtables": "1.0.6",
"@adobe/remark-gridtables": "1.0.2",
"@spectrum-web-components/bundle": "0.28.7",
