feat(sites-29416)!: Add support for importing non-image assets (#7)

ManasMaji · Ben Helleman · web-flow · commit ef4b44478528 · 2025-02-26T21:39:07.000+05:30
BREAKING CHANGE: The API getImageUrlsFromMarkdown has been renamed to getAssetUrlsFromMarkdown to more accurately reflect what it does.

Co-authored-by: Ben Helleman <bhellema@adobe.com>
diff --git a/src/index.js b/src/index.js
@@ -10,9 +10,9 @@
  * governing permissions and limitations under the License.
  */
 import { createJcrPackage } from './package/packaging.js';
-import { getImageUrlsFromMarkdown } from './package/image-mapping.js';
+import { getAssetUrlsFromMarkdown } from './package/asset-mapping.js';
 
 export {
   createJcrPackage,
-  getImageUrlsFromMarkdown,
+  getAssetUrlsFromMarkdown,
 };
diff --git a/src/package/asset-mapping.js b/src/package/asset-mapping.js
@@ -18,6 +18,9 @@ const imageRegex = /!\[([^\]]*)]\(([^) "]+)(?: *"([^"]*)")?\)|!\[([^\]]*)]\[([^\
 // Regex for reference definitions
 const referenceRegex = /\[([^\]]+)]:\s*(\S+)/g;
 
+// Regex for non-image asset links (PDFs, docs, excel etc.)
+const nonImageAssetRegex = /(?:\[(.*?)\]|\[.*?\])\(([^)]+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp|rtf|txt|csv))\)|\[(.*?)\]:\s*(\S+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp|rtf|txt|csv))/gi;
+
 /**
  * Function to find reference definitions in a markdown file.
  *
@@ -36,51 +39,60 @@ const findReferenceDefinitionsInMarkdown = (markdownContent) => {
 };
 
 /**
- * Function to scan for images in a markdown file.
+ * Function to scan for assets in a markdown file.
  *
  * @param markdownContent - The content of the markdown file
- * @returns {Array<string>} A Map of image urls as key
+ * @returns {Array<string>} A Map of asset urls as key
  */
-const findImagesInMarkdown = (markdownContent) => {
+const findAssetsInMarkdown = (markdownContent) => {
   const references = findReferenceDefinitionsInMarkdown(markdownContent);
 
-  const imageUrls = [];
+  const assetUrls = [];
 
   // Identify each image url in the markdown content
   let match;
+  let url;
   // eslint-disable-next-line no-cond-assign
   while ((match = imageRegex.exec(markdownContent)) !== null) {
-    let url;
     if (match[2]) { // Inline image
       // eslint-disable-next-line prefer-destructuring
       url = match[2];
     } else if (match[5]) { // Reference-style image
       url = references[match[5]] || null; // Resolve URL from reference map
     }
     if (url) {
-      imageUrls.push(url);
+      assetUrls.push(url);
+    }
+  }
+
+  // Find and add only non-image asset links
+  // eslint-disable-next-line no-cond-assign
+  while ((match = nonImageAssetRegex.exec(markdownContent)) !== null) {
+    url = match[2] || match[3];
+    if (url) {
+      assetUrls.push(url);
     }
   }
 
-  return imageUrls;
+  return assetUrls;
 };
 
 /**
- * Get the list image urls present in the markdown.
+ * Get the list asset urls present in the markdown.
  * @param {string} markdownContent - The content of the markdown file
- * @returns {Array<string>} An array of image urls.
+ * @returns {Array<string>} An array of asset urls.
  */
-const getImageUrlsFromMarkdown = (markdownContent) => {
+const getAssetUrlsFromMarkdown = (markdownContent) => {
   try {
-    return findImagesInMarkdown(markdownContent);
+    return findAssetsInMarkdown(markdownContent);
   } catch (error) {
     // eslint-disable-next-line no-console
-    console.warn('Error getting image urls from markdown:', error);
+    console.warn('Error getting asset urls from markdown:', error);
     return [];
   }
 };
 
 export {
   // eslint-disable-next-line import/prefer-default-export
-  getImageUrlsFromMarkdown,
+  getAssetUrlsFromMarkdown,
 };
diff --git a/src/package/packaging.js b/src/package/packaging.js
@@ -24,7 +24,7 @@ import {
 import { saveFile } from '../shared/filesystem.js';
 
 let jcrPages = [];
-const IMAGE_MAPPING_FILE = 'image-mappings.json';
+const ASSET_MAPPING_FILE = 'asset-mappings.json';
 
 const init = () => {
   jcrPages = [];
@@ -40,10 +40,10 @@ const addPage = async (page, dir, prefix, zip) => {
  * @param {string} xml - The xml content of the page
  * @param {string} pageUrl - The url of the site page
  * @param {string} assetFolderName - The name of the asset folder(s) in AEM
- * @param {Map} imageMappings - A map to store the image urls and their corresponding jcr paths
+ * @param {Map} assetMappings - A map to store the asset urls and their corresponding jcr paths
  * @returns {Promise<*|string>} - The updated xml content
  */
-export const updateAssetReferences = async (xml, pageUrl, assetFolderName, imageMappings) => {
+export const updateAssetReferences = async (xml, pageUrl, assetFolderName, assetMappings) => {
   let doc;
   try {
     doc = getParsedXml(xml);
@@ -54,14 +54,14 @@ export const updateAssetReferences = async (xml, pageUrl, assetFolderName, image
   }
 
   // Start traversal from the document root and update the asset references
-  traverseAndUpdateAssetReferences(doc.documentElement, pageUrl, assetFolderName, imageMappings);
+  traverseAndUpdateAssetReferences(doc.documentElement, pageUrl, assetFolderName, assetMappings);
 
   const serializer = new XMLSerializer();
   return serializer.serializeToString(doc);
 };
 
 // eslint-disable-next-line max-len
-export const getJcrPages = async (pages, siteFolderName, assetFolderName, imageMappings) => Promise.all(pages.map(async (page) => ({
+export const getJcrPages = async (pages, siteFolderName, assetFolderName, assetMappings) => Promise.all(pages.map(async (page) => ({
   path: page.path,
   sourceXml: page.data,
   pageProperties: getPageProperties(page.data),
@@ -70,7 +70,7 @@ export const getJcrPages = async (pages, siteFolderName, assetFolderName, imageM
     page.data,
     page.url,
     assetFolderName,
-    imageMappings,
+    assetMappings,
   ),
   jcrPath: getJcrPagePath(page.path, siteFolderName),
   contentXmlPath: `jcr_root${getJcrPagePath(page.path, siteFolderName)}/.content.xml`,
@@ -120,31 +120,31 @@ const getEmptyAncestorPages = (pages) => {
 };
 
 /**
- * Save the image mappings to a file.
- * @param {Map} imageMappings - A map of image urls and their corresponding jcr paths
+ * Save the asset mappings to a file.
+ * @param {Map} assetMappings - A map of asset urls and their corresponding jcr paths
  * @param {*} outputDirectory - The directory handle
  */
-const saveImageMappings = async (imageMappings, outputDirectory) => {
+const saveAssetMappings = async (assetMappings, outputDirectory) => {
   // Convert Map to a plain object
-  const obj = Object.fromEntries(imageMappings);
+  const obj = Object.fromEntries(assetMappings);
 
-  // Save the updated image mapping content into a file
-  await saveFile(outputDirectory, IMAGE_MAPPING_FILE, JSON.stringify(obj, null, 2));
+  // Save the updated asset mapping content into a file
+  await saveFile(outputDirectory, ASSET_MAPPING_FILE, JSON.stringify(obj, null, 2));
 };
 
 /**
  * Creates a JCR content package from a directory containing pages.
  * @param {*} outputDirectory - The directory handle
  * @param {Array} pages - An array of pages
- * @param {Array<string>} imageUrls - An array of image urls that were found in the markdown.
+ * @param {Array<string>} assetUrls - An array of asset urls that were found in the markdown.
  * @param {string} siteFolderName - The name of the site folder(s) in AEM
  * @param {string} assetFolderName - The name of the asset folder(s) in AEM
  * @returns {Promise} The file handle for the generated package.
  */
 export const createJcrPackage = async (
   outputDirectory,
   pages,
-  imageUrls,
+  assetUrls,
   siteFolderName,
   assetFolderName,
 ) => {
@@ -158,10 +158,10 @@ export const createJcrPackage = async (
   const prefix = 'jcr';
 
   // create a map using the provided asset urls as keys (values will be populated later)
-  const imageMappings = new Map(imageUrls.map((url) => [url, '']));
+  const assetMappings = new Map(assetUrls.map((url) => [url, '']));
 
   // add the pages
-  jcrPages = await getJcrPages(pages, siteFolderName, assetFolderName, imageMappings);
+  jcrPages = await getJcrPages(pages, siteFolderName, assetFolderName, assetMappings);
   for (let i = 0; i < jcrPages.length; i += 1) {
     const page = jcrPages[i];
     // eslint-disable-next-line no-await-in-loop
@@ -188,5 +188,5 @@ export const createJcrPackage = async (
   await zip.generateAsync({ type: outputType })
     .then(async (blob) => saveFile(outputDirectory, `${packageName}.zip`, blob));
 
-  await saveImageMappings(imageMappings, outputDirectory);
+  await saveAssetMappings(assetMappings, outputDirectory);
 };
diff --git a/src/package/packaging.utils.js b/src/package/packaging.utils.js
@@ -241,7 +241,7 @@ export function getFullAssetUrl(assetReference, pageUrl) {
 
   // If the asset reference starts with './', it is a relative file path
   if (assetReference.startsWith('./')) {
-    return new URL(assetReference, pageUrlObj.href).pathname;
+    return new URL(assetReference, pageUrlObj.href).href;
   }
 
   // Absolute asset reference, appending the asset path to the host
diff --git a/test/package/asset-mapping.test.js b/test/package/asset-mapping.test.js
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+/* eslint-env mocha */
+import { readFile } from 'fs/promises';
+import { expect } from 'chai';
+import { getAssetUrlsFromMarkdown } from '../../src/package/asset-mapping.js';
+
+const loadFile = async (file) => readFile(new URL(file, import.meta.url), 'utf-8');
+
+describe('getAssetUrlsFromMarkdown', () => {
+  it('should return an array of asset urls (reference urls)', () => {
+    const markdownContent = `+-----------------------+
+| Hero                  |
++=======================+
+| ![Car 1][image0]      |
+| ![Car 2][image1]      |
++-----------------------+
+
+[image0]: https://aem.live/car.jpeg
+[image1]: https://aem.live/car2.jpeg`;
+
+    const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
+    expect(imageUrls).to.have.lengthOf(2);
+    expect(imageUrls[0]).to.equal('https://aem.live/car.jpeg');
+    expect(imageUrls[1]).to.equal('https://aem.live/car2.jpeg');
+  });
+
+  it('should return an array of asset urls (inlined urls)', () => {
+    const markdownContent = `+------------------------------------------+
+| Hero                                     |
++==========================================+
+| ![Car 1](https://aem.live/car.jpeg)      |
+| ![Car 2](https://aem.live/car2.jpeg)     |
++------------------------------------------+`;
+
+    const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
+    expect(imageUrls).to.have.lengthOf(2);
+    expect(imageUrls[0]).to.equal('https://aem.live/car.jpeg');
+    expect(imageUrls[1]).to.equal('https://aem.live/car2.jpeg');
+  });
+
+  it('should return non-image asset URLs for various document types', () => {
+    const markdownContent = `
+      Click [PDF](/content/dam/doe/foo/bar.pdf) to download the guide.
+      View [DOC](/content/dam/doe/docs/sample.doc).
+      Open [DOCX](/content/dam/doe/docs/sample.docx).
+      See [XLS](/content/dam/doe/spreadsheets/data.xls).
+      Try [XLSX](/content/dam/doe/spreadsheets/data.xlsx).
+      Read [PPT](/content/dam/doe/presentations/slide.ppt).
+      Check [PPTX](/content/dam/doe/presentations/slide.pptx).
+      Open [ODT](/content/dam/doe/texts/note.odt).
+      Review [ODS](/content/dam/doe/spreadsheets/sheet.ods).
+      Access [ODP](/content/dam/doe/presentations/deck.odp).
+      Check [RTF](/content/dam/doe/docs/document.rtf).
+      Read [TXT](/content/dam/doe/texts/readme.txt).
+      Get [CSV](/content/dam/doe/data/records.csv).
+      Invalid [Fake](/content/dam/doe/fake/myimage.pdf.png).
+      Also check [here](https://example.live/siteFoo.html).
+    `;
+
+    const assetUrls = getAssetUrlsFromMarkdown(markdownContent);
+
+    expect(assetUrls).to.have.lengthOf(13); // 13 valid document URLs
+    expect(assetUrls).to.include('/content/dam/doe/foo/bar.pdf');
+    expect(assetUrls).to.include('/content/dam/doe/docs/sample.doc');
+    expect(assetUrls).to.include('/content/dam/doe/docs/sample.docx');
+    expect(assetUrls).to.include('/content/dam/doe/spreadsheets/data.xls');
+    expect(assetUrls).to.include('/content/dam/doe/spreadsheets/data.xlsx');
+    expect(assetUrls).to.include('/content/dam/doe/presentations/slide.ppt');
+    expect(assetUrls).to.include('/content/dam/doe/presentations/slide.pptx');
+    expect(assetUrls).to.include('/content/dam/doe/texts/note.odt');
+    expect(assetUrls).to.include('/content/dam/doe/spreadsheets/sheet.ods');
+    expect(assetUrls).to.include('/content/dam/doe/presentations/deck.odp');
+    expect(assetUrls).to.include('/content/dam/doe/docs/document.rtf');
+    expect(assetUrls).to.include('/content/dam/doe/texts/readme.txt');
+    expect(assetUrls).to.include('/content/dam/doe/data/records.csv');
+
+    // Ensure the invalid case is excluded
+    expect(assetUrls).to.not.include('/content/dam/doe/fake/myimage.pdf.png');
+  });
+
+  it('should return an array with no image urls', () => {
+    const markdownContent = 'This is a markdown file with no images.';
+
+    const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
+    expect(imageUrls).to.have.lengthOf(0);
+  });
+
+  it('should return an array of image urls (absolute/relative urls)', () => {
+    const markdownContent = `+------------------------------------------+
+| Hero                                     |
++==========================================+
+| ![Car 1](/car.jpeg)                      |
+| ![Car 2][image0]                         |
++------------------------------------------+
+
+[image0]: /test/car2.jpeg`;
+
+    const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
+    expect(imageUrls).to.have.lengthOf(2);
+    expect(imageUrls[0]).to.equal('/car.jpeg');
+    expect(imageUrls[1]).to.equal('/test/car2.jpeg');
+  });
+
+  // should call getAssetUrlsFromMarkdown with the correct arguments
+  it('test getAssetUrlsFromMarkdown', async () => {
+    const markdown = await loadFile('../fixtures/mystique/hero.md');
+    const imageUrl = await getAssetUrlsFromMarkdown(markdown);
+    expect(imageUrl).to.be.an('array');
+    expect(imageUrl).to.have.lengthOf(1);
+  });
+});
diff --git a/test/package/image-mapping.test.js b/test/package/image-mapping.test.js

Original file line number	Diff line number	Diff line change
`@@ -241,7 +241,7 @@ export function getFullAssetUrl(assetReference, pageUrl) {`
`241`	`241`
`242`	`242`	`// If the asset reference starts with './', it is a relative file path`
`243`	`243`	`if (assetReference.startsWith('./')) {`
`244`		`- return new URL(assetReference, pageUrlObj.href).pathname;`
	`244`	`+ return new URL(assetReference, pageUrlObj.href).href;`
`245`	`245`	`}`
`246`	`246`
`247`	`247`	`// Absolute asset reference, appending the asset path to the host`