Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(sites-29416)!: Add support for importing non-image assets #7

Merged
merged 18 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
* governing permissions and limitations under the License.
*/
import { createJcrPackage } from './package/packaging.js';
import { getImageUrlsFromMarkdown } from './package/image-mapping.js';
import { getAssetUrlsFromMarkdown } from './package/asset-mapping.js';
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

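This renames a public export (`getImageUrlsFromMarkdown` → `getAssetUrlsFromMarkdown`), which is a breaking change, so it requires a major version bump when releasing!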


export {
createJcrPackage,
getImageUrlsFromMarkdown,
getAssetUrlsFromMarkdown,
};
38 changes: 25 additions & 13 deletions src/package/image-mapping.js → src/package/asset-mapping.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ const imageRegex = /!\[([^\]]*)]\(([^) "]+)(?: *"([^"]*)")?\)|!\[([^\]]*)]\[([^\
// Regex for reference definitions
const referenceRegex = /\[([^\]]+)]:\s*(\S+)/g;

// Regex for non-image asset links (PDFs, docs, excel etc.), matched case-insensitively.
// Two alternatives:
//   1. inline link `[text](url.ext)`           -> group 1 = link text, group 2 = url
//   2. reference definition `[label]: url.ext` -> group 3 = label,     group 4 = url
// In both alternatives the url has to END with one of the listed extensions: the inline
// form is closed by `)`, the reference form by whitespace or end of input (`(?!\S)`).
// Without that boundary `[x]: a.docx` would be captured as `a.doc` and `[x]: a.pdf.png`
// as `a.pdf`.
const nonImageAssetRegex = /\[(.*?)\]\(([^)]+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp|rtf|txt|csv))\)|\[(.*?)\]:\s*(\S+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp|rtf|txt|csv))(?!\S)/gi;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a unit test to validate that we are catching the different types of assets.


/**
* Function to find reference definitions in a markdown file.
*
Expand All @@ -36,51 +39,60 @@ const findReferenceDefinitionsInMarkdown = (markdownContent) => {
};

/**
 * Function to scan for assets in a markdown file.
 *
 * Image urls (inline and reference-style) are collected first, followed by the urls of
 * non-image assets (PDFs, office documents, ...) in the order they are matched.
 *
 * @param markdownContent - The content of the markdown file
 * @returns {Array<string>} An array of asset urls
 */
const findAssetsInMarkdown = (markdownContent) => {
  const references = findReferenceDefinitionsInMarkdown(markdownContent);

  const assetUrls = [];

  // Identify each image url in the markdown content
  let match;
  // eslint-disable-next-line no-cond-assign
  while ((match = imageRegex.exec(markdownContent)) !== null) {
    // Scoped per match: a match that yields neither an inline url nor a usable
    // reference label must not inherit the url of the previous match.
    let url;
    if (match[2]) { // Inline image
      // eslint-disable-next-line prefer-destructuring
      url = match[2];
    } else if (match[5]) { // Reference-style image
      url = references[match[5]] || null; // Resolve URL from reference map
    }
    if (url) {
      assetUrls.push(url);
    }
  }

  // Find and add only non-image asset links
  // eslint-disable-next-line no-cond-assign
  while ((match = nonImageAssetRegex.exec(markdownContent)) !== null) {
    // Group 2 is the url of an inline link, group 4 the url of a reference definition.
    // Group 3 is only the reference label and must not be treated as a url.
    const url = match[2] || match[4];
    if (url) {
      assetUrls.push(url);
    }
  }

  return assetUrls;
};

/**
 * Get the list of asset urls present in the markdown.
 *
 * Best-effort wrapper around findAssetsInMarkdown: if scanning throws, the error is
 * logged as a warning and an empty array is returned instead of propagating.
 *
 * @param {string} markdownContent - The content of the markdown file
 * @returns {Array<string>} An array of asset urls.
 */
const getAssetUrlsFromMarkdown = (markdownContent) => {
  try {
    return findAssetsInMarkdown(markdownContent);
  } catch (error) {
    // eslint-disable-next-line no-console
    console.warn('Error getting asset urls from markdown:', error);
    return [];
  }
};

export {
// eslint-disable-next-line import/prefer-default-export
getImageUrlsFromMarkdown,
getAssetUrlsFromMarkdown,
};
34 changes: 17 additions & 17 deletions src/package/packaging.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import {
import { saveFile } from '../shared/filesystem.js';

let jcrPages = [];
const IMAGE_MAPPING_FILE = 'image-mappings.json';
const ASSET_MAPPING_FILE = 'asset-mappings.json';

const init = () => {
jcrPages = [];
Expand All @@ -40,10 +40,10 @@ const addPage = async (page, dir, prefix, zip) => {
* @param {string} xml - The xml content of the page
* @param {string} pageUrl - The url of the site page
* @param {string} assetFolderName - The name of the asset folder(s) in AEM
* @param {Map} imageMappings - A map to store the image urls and their corresponding jcr paths
* @param {Map} assetMappings - A map to store the asset urls and their corresponding jcr paths
* @returns {Promise<*|string>} - The updated xml content
*/
export const updateAssetReferences = async (xml, pageUrl, assetFolderName, imageMappings) => {
export const updateAssetReferences = async (xml, pageUrl, assetFolderName, assetMappings) => {
let doc;
try {
doc = getParsedXml(xml);
Expand All @@ -54,14 +54,14 @@ export const updateAssetReferences = async (xml, pageUrl, assetFolderName, image
}

// Start traversal from the document root and update the asset references
traverseAndUpdateAssetReferences(doc.documentElement, pageUrl, assetFolderName, imageMappings);
traverseAndUpdateAssetReferences(doc.documentElement, pageUrl, assetFolderName, assetMappings);

const serializer = new XMLSerializer();
return serializer.serializeToString(doc);
};

// eslint-disable-next-line max-len
export const getJcrPages = async (pages, siteFolderName, assetFolderName, imageMappings) => Promise.all(pages.map(async (page) => ({
export const getJcrPages = async (pages, siteFolderName, assetFolderName, assetMappings) => Promise.all(pages.map(async (page) => ({
path: page.path,
sourceXml: page.data,
pageProperties: getPageProperties(page.data),
Expand All @@ -70,7 +70,7 @@ export const getJcrPages = async (pages, siteFolderName, assetFolderName, imageM
page.data,
page.url,
assetFolderName,
imageMappings,
assetMappings,
),
jcrPath: getJcrPagePath(page.path, siteFolderName),
contentXmlPath: `jcr_root${getJcrPagePath(page.path, siteFolderName)}/.content.xml`,
Expand Down Expand Up @@ -120,31 +120,31 @@ const getEmptyAncestorPages = (pages) => {
};

/**
 * Save the asset mappings to a file.
 *
 * The map is serialised as a pretty-printed JSON object and written to
 * ASSET_MAPPING_FILE inside the output directory.
 *
 * @param {Map} assetMappings - A map of asset urls and their corresponding jcr paths
 * @param {*} outputDirectory - The directory handle
 */
const saveAssetMappings = async (assetMappings, outputDirectory) => {
  // Convert Map to a plain object
  const obj = Object.fromEntries(assetMappings);

  // Save the updated asset mapping content into a file
  await saveFile(outputDirectory, ASSET_MAPPING_FILE, JSON.stringify(obj, null, 2));
};

/**
* Creates a JCR content package from a directory containing pages.
* @param {*} outputDirectory - The directory handle
* @param {Array} pages - An array of pages
* @param {Array<string>} imageUrls - An array of image urls that were found in the markdown.
* @param {Array<string>} assetUrls - An array of asset urls that were found in the markdown.
* @param {string} siteFolderName - The name of the site folder(s) in AEM
* @param {string} assetFolderName - The name of the asset folder(s) in AEM
* @returns {Promise} The file handle for the generated package.
*/
export const createJcrPackage = async (
outputDirectory,
pages,
imageUrls,
assetUrls,
siteFolderName,
assetFolderName,
) => {
Expand All @@ -158,10 +158,10 @@ export const createJcrPackage = async (
const prefix = 'jcr';

// create a map using the provided asset urls as keys (values will be populated later)
const imageMappings = new Map(imageUrls.map((url) => [url, '']));
const assetMappings = new Map(assetUrls.map((url) => [url, '']));

// add the pages
jcrPages = await getJcrPages(pages, siteFolderName, assetFolderName, imageMappings);
jcrPages = await getJcrPages(pages, siteFolderName, assetFolderName, assetMappings);
for (let i = 0; i < jcrPages.length; i += 1) {
const page = jcrPages[i];
// eslint-disable-next-line no-await-in-loop
Expand All @@ -188,5 +188,5 @@ export const createJcrPackage = async (
await zip.generateAsync({ type: outputType })
.then(async (blob) => saveFile(outputDirectory, `${packageName}.zip`, blob));

await saveImageMappings(imageMappings, outputDirectory);
await saveAssetMappings(assetMappings, outputDirectory);
};
2 changes: 1 addition & 1 deletion src/package/packaging.utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ export function getFullAssetUrl(assetReference, pageUrl) {

// If the asset reference starts with './', it is a relative file path
if (assetReference.startsWith('./')) {
return new URL(assetReference, pageUrlObj.href).pathname;
return new URL(assetReference, pageUrlObj.href).href;
}

// Absolute asset reference, appending the asset path to the host
Expand Down
121 changes: 121 additions & 0 deletions test/package/asset-mapping.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
/* eslint-env mocha */
import { readFile } from 'fs/promises';
import { expect } from 'chai';
import { getAssetUrlsFromMarkdown } from '../../src/package/asset-mapping.js';

const loadFile = async (file) => readFile(new URL(file, import.meta.url), 'utf-8');

// Test suite for getAssetUrlsFromMarkdown: image urls (inline and reference-style)
// and inline links to non-image documents.
// NOTE(review): no case here covers a reference definition pointing at a document
// (e.g. `[guide]: /path/file.pdf`), although the non-image regex has a branch for it.
describe('getAssetUrlsFromMarkdown', () => {
// Reference-style images: each `![alt][label]` is resolved through its `[label]: url` line.
it('should return an array of asset urls (reference urls)', () => {
const markdownContent = `+-----------------------+
| Hero |
+=======================+
| ![Car 1][image0] |
| ![Car 2][image1] |
+-----------------------+

[image0]: https://aem.live/car.jpeg
[image1]: https://aem.live/car2.jpeg`;

const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
expect(imageUrls).to.have.lengthOf(2);
expect(imageUrls[0]).to.equal('https://aem.live/car.jpeg');
expect(imageUrls[1]).to.equal('https://aem.live/car2.jpeg');
});

// Inline images: the url is taken directly from `![alt](url)`.
it('should return an array of asset urls (inlined urls)', () => {
const markdownContent = `+------------------------------------------+
| Hero |
+==========================================+
| ![Car 1](https://aem.live/car.jpeg) |
| ![Car 2](https://aem.live/car2.jpeg) |
+------------------------------------------+`;

const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
expect(imageUrls).to.have.lengthOf(2);
expect(imageUrls[0]).to.equal('https://aem.live/car.jpeg');
expect(imageUrls[1]).to.equal('https://aem.live/car2.jpeg');
});

// One inline link per supported document extension, plus two links that must be
// ignored: a `.pdf.png` file name and a plain `.html` page.
it('should return non-image asset URLs for various document types', () => {
const markdownContent = `
Click [PDF](/content/dam/doe/foo/bar.pdf) to download the guide.
View [DOC](/content/dam/doe/docs/sample.doc).
Open [DOCX](/content/dam/doe/docs/sample.docx).
See [XLS](/content/dam/doe/spreadsheets/data.xls).
Try [XLSX](/content/dam/doe/spreadsheets/data.xlsx).
Read [PPT](/content/dam/doe/presentations/slide.ppt).
Check [PPTX](/content/dam/doe/presentations/slide.pptx).
Open [ODT](/content/dam/doe/texts/note.odt).
Review [ODS](/content/dam/doe/spreadsheets/sheet.ods).
Access [ODP](/content/dam/doe/presentations/deck.odp).
Check [RTF](/content/dam/doe/docs/document.rtf).
Read [TXT](/content/dam/doe/texts/readme.txt).
Get [CSV](/content/dam/doe/data/records.csv).
Invalid [Fake](/content/dam/doe/fake/myimage.pdf.png).
Also check [here](https://example.live/siteFoo.html).
`;

const assetUrls = getAssetUrlsFromMarkdown(markdownContent);

expect(assetUrls).to.have.lengthOf(13); // 13 valid document URLs
expect(assetUrls).to.include('/content/dam/doe/foo/bar.pdf');
expect(assetUrls).to.include('/content/dam/doe/docs/sample.doc');
expect(assetUrls).to.include('/content/dam/doe/docs/sample.docx');
expect(assetUrls).to.include('/content/dam/doe/spreadsheets/data.xls');
expect(assetUrls).to.include('/content/dam/doe/spreadsheets/data.xlsx');
expect(assetUrls).to.include('/content/dam/doe/presentations/slide.ppt');
expect(assetUrls).to.include('/content/dam/doe/presentations/slide.pptx');
expect(assetUrls).to.include('/content/dam/doe/texts/note.odt');
expect(assetUrls).to.include('/content/dam/doe/spreadsheets/sheet.ods');
expect(assetUrls).to.include('/content/dam/doe/presentations/deck.odp');
expect(assetUrls).to.include('/content/dam/doe/docs/document.rtf');
expect(assetUrls).to.include('/content/dam/doe/texts/readme.txt');
expect(assetUrls).to.include('/content/dam/doe/data/records.csv');

// Ensure the invalid case is excluded
expect(assetUrls).to.not.include('/content/dam/doe/fake/myimage.pdf.png');
});

// Markdown without any image or document link yields an empty array.
it('should return an array with no image urls', () => {
const markdownContent = 'This is a markdown file with no images.';

const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
expect(imageUrls).to.have.lengthOf(0);
});

// Urls without a host are returned exactly as written in the markdown, for both
// the inline form and the reference-style form.
it('should return an array of image urls (absolute/relative urls)', () => {
const markdownContent = `+------------------------------------------+
| Hero |
+==========================================+
| ![Car 1](/car.jpeg) |
| ![Car 2][image0] |
+------------------------------------------+

[image0]: /test/car2.jpeg`;

const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
expect(imageUrls).to.have.lengthOf(2);
expect(imageUrls[0]).to.equal('/car.jpeg');
expect(imageUrls[1]).to.equal('/test/car2.jpeg');
});

// Fixture-based check: the hero.md fixture is expected to yield exactly one asset url.
// NOTE(review): getAssetUrlsFromMarkdown returns its array synchronously, so the
// `await` on that call is redundant (only loadFile is asynchronous).
it('test getAssetUrlsFromMarkdown', async () => {
const markdown = await loadFile('../fixtures/mystique/hero.md');
const imageUrl = await getAssetUrlsFromMarkdown(markdown);
expect(imageUrl).to.be.an('array');
expect(imageUrl).to.have.lengthOf(1);
});
});
Loading