Skip to content

Commit ef4b444

Browse files
ManasMaji and Ben Helleman authored
feat(sites-29416)!: Add support for importing non-image assets (#7)
BREAKING CHANGE: The API getImageUrlsFromMarkdown has been renamed to getAssetUrlsFromMarkdown to describe more accurately what it does.
Co-authored-by: Ben Helleman <bhellema@adobe.com>
1 parent a446d5c commit ef4b444

File tree

6 files changed

+166
-114
lines changed

6 files changed

+166
-114
lines changed

src/index.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
* governing permissions and limitations under the License.
1111
*/
1212
import { createJcrPackage } from './package/packaging.js';
13-
import { getImageUrlsFromMarkdown } from './package/image-mapping.js';
13+
import { getAssetUrlsFromMarkdown } from './package/asset-mapping.js';
1414

1515
export {
1616
createJcrPackage,
17-
getImageUrlsFromMarkdown,
17+
getAssetUrlsFromMarkdown,
1818
};

src/package/image-mapping.js src/package/asset-mapping.js

+25-13
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ const imageRegex = /!\[([^\]]*)]\(([^) "]+)(?: *"([^"]*)")?\)|!\[([^\]]*)]\[([^\
1818
// Regex for reference definitions
1919
const referenceRegex = /\[([^\]]+)]:\s*(\S+)/g;
2020

21+
// Regex for non-image asset links (PDFs, docs, excel etc.)
22+
const nonImageAssetRegex = /(?:\[(.*?)\]|\[.*?\])\(([^)]+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp|rtf|txt|csv))\)|\[(.*?)\]:\s*(\S+\.(?:pdf|doc|docx|xls|xlsx|ppt|pptx|odt|ods|odp|rtf|txt|csv))/gi;
23+
2124
/**
2225
* Function to find reference definitions in a markdown file.
2326
*
@@ -36,51 +39,60 @@ const findReferenceDefinitionsInMarkdown = (markdownContent) => {
3639
};
3740

3841
/**
39-
* Function to scan for images in a markdown file.
42+
* Function to scan for assets in a markdown file.
4043
*
4144
* @param markdownContent - The content of the markdown file
42-
* @returns {Array<string>} A Map of image urls as key
45+
* @returns {Array<string>} An array of asset urls
4346
*/
44-
const findImagesInMarkdown = (markdownContent) => {
47+
const findAssetsInMarkdown = (markdownContent) => {
4548
const references = findReferenceDefinitionsInMarkdown(markdownContent);
4649

47-
const imageUrls = [];
50+
const assetUrls = [];
4851

4952
// Identify each image url in the markdown content
5053
let match;
54+
let url;
5155
// eslint-disable-next-line no-cond-assign
5256
while ((match = imageRegex.exec(markdownContent)) !== null) {
53-
let url;
5457
if (match[2]) { // Inline image
5558
// eslint-disable-next-line prefer-destructuring
5659
url = match[2];
5760
} else if (match[5]) { // Reference-style image
5861
url = references[match[5]] || null; // Resolve URL from reference map
5962
}
6063
if (url) {
61-
imageUrls.push(url);
64+
assetUrls.push(url);
65+
}
66+
}
67+
68+
// Find and add only non-image asset links
69+
// eslint-disable-next-line no-cond-assign
70+
while ((match = nonImageAssetRegex.exec(markdownContent)) !== null) {
71+
url = match[2] || match[3];
72+
if (url) {
73+
assetUrls.push(url);
6274
}
6375
}
6476

65-
return imageUrls;
77+
return assetUrls;
6678
};
6779

6880
/**
69-
* Get the list image urls present in the markdown.
81+
* Get the list of asset urls present in the markdown.
7082
* @param {string} markdownContent - The content of the markdown file
71-
* @returns {Array<string>} An array of image urls.
83+
* @returns {Array<string>} An array of asset urls.
7284
*/
73-
const getImageUrlsFromMarkdown = (markdownContent) => {
85+
const getAssetUrlsFromMarkdown = (markdownContent) => {
7486
try {
75-
return findImagesInMarkdown(markdownContent);
87+
return findAssetsInMarkdown(markdownContent);
7688
} catch (error) {
7789
// eslint-disable-next-line no-console
78-
console.warn('Error getting image urls from markdown:', error);
90+
console.warn('Error getting asset urls from markdown:', error);
7991
return [];
8092
}
8193
};
8294

8395
export {
8496
// eslint-disable-next-line import/prefer-default-export
85-
getImageUrlsFromMarkdown,
97+
getAssetUrlsFromMarkdown,
8698
};

src/package/packaging.js

+17-17
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import {
2424
import { saveFile } from '../shared/filesystem.js';
2525

2626
let jcrPages = [];
27-
const IMAGE_MAPPING_FILE = 'image-mappings.json';
27+
const ASSET_MAPPING_FILE = 'asset-mappings.json';
2828

2929
const init = () => {
3030
jcrPages = [];
@@ -40,10 +40,10 @@ const addPage = async (page, dir, prefix, zip) => {
4040
* @param {string} xml - The xml content of the page
4141
* @param {string} pageUrl - The url of the site page
4242
* @param {string} assetFolderName - The name of the asset folder(s) in AEM
43-
* @param {Map} imageMappings - A map to store the image urls and their corresponding jcr paths
43+
* @param {Map} assetMappings - A map to store the asset urls and their corresponding jcr paths
4444
* @returns {Promise<*|string>} - The updated xml content
4545
*/
46-
export const updateAssetReferences = async (xml, pageUrl, assetFolderName, imageMappings) => {
46+
export const updateAssetReferences = async (xml, pageUrl, assetFolderName, assetMappings) => {
4747
let doc;
4848
try {
4949
doc = getParsedXml(xml);
@@ -54,14 +54,14 @@ export const updateAssetReferences = async (xml, pageUrl, assetFolderName, image
5454
}
5555

5656
// Start traversal from the document root and update the asset references
57-
traverseAndUpdateAssetReferences(doc.documentElement, pageUrl, assetFolderName, imageMappings);
57+
traverseAndUpdateAssetReferences(doc.documentElement, pageUrl, assetFolderName, assetMappings);
5858

5959
const serializer = new XMLSerializer();
6060
return serializer.serializeToString(doc);
6161
};
6262

6363
// eslint-disable-next-line max-len
64-
export const getJcrPages = async (pages, siteFolderName, assetFolderName, imageMappings) => Promise.all(pages.map(async (page) => ({
64+
export const getJcrPages = async (pages, siteFolderName, assetFolderName, assetMappings) => Promise.all(pages.map(async (page) => ({
6565
path: page.path,
6666
sourceXml: page.data,
6767
pageProperties: getPageProperties(page.data),
@@ -70,7 +70,7 @@ export const getJcrPages = async (pages, siteFolderName, assetFolderName, imageM
7070
page.data,
7171
page.url,
7272
assetFolderName,
73-
imageMappings,
73+
assetMappings,
7474
),
7575
jcrPath: getJcrPagePath(page.path, siteFolderName),
7676
contentXmlPath: `jcr_root${getJcrPagePath(page.path, siteFolderName)}/.content.xml`,
@@ -120,31 +120,31 @@ const getEmptyAncestorPages = (pages) => {
120120
};
121121

122122
/**
123-
* Save the image mappings to a file.
124-
* @param {Map} imageMappings - A map of image urls and their corresponding jcr paths
123+
* Save the asset mappings to a file.
124+
* @param {Map} assetMappings - A map of asset urls and their corresponding jcr paths
125125
* @param {*} outputDirectory - The directory handle
126126
*/
127-
const saveImageMappings = async (imageMappings, outputDirectory) => {
127+
const saveAssetMappings = async (assetMappings, outputDirectory) => {
128128
// Convert Map to a plain object
129-
const obj = Object.fromEntries(imageMappings);
129+
const obj = Object.fromEntries(assetMappings);
130130

131-
// Save the updated image mapping content into a file
132-
await saveFile(outputDirectory, IMAGE_MAPPING_FILE, JSON.stringify(obj, null, 2));
131+
// Save the updated asset mapping content into a file
132+
await saveFile(outputDirectory, ASSET_MAPPING_FILE, JSON.stringify(obj, null, 2));
133133
};
134134

135135
/**
136136
* Creates a JCR content package from a directory containing pages.
137137
* @param {*} outputDirectory - The directory handle
138138
* @param {Array} pages - An array of pages
139-
* @param {Array<string>} imageUrls - An array of image urls that were found in the markdown.
139+
* @param {Array<string>} assetUrls - An array of asset urls that were found in the markdown.
140140
* @param {string} siteFolderName - The name of the site folder(s) in AEM
141141
* @param {string} assetFolderName - The name of the asset folder(s) in AEM
142142
* @returns {Promise} The file handle for the generated package.
143143
*/
144144
export const createJcrPackage = async (
145145
outputDirectory,
146146
pages,
147-
imageUrls,
147+
assetUrls,
148148
siteFolderName,
149149
assetFolderName,
150150
) => {
@@ -158,10 +158,10 @@ export const createJcrPackage = async (
158158
const prefix = 'jcr';
159159

160160
// create a map using the provided asset urls as keys (values will be populated later)
161-
const imageMappings = new Map(imageUrls.map((url) => [url, '']));
161+
const assetMappings = new Map(assetUrls.map((url) => [url, '']));
162162

163163
// add the pages
164-
jcrPages = await getJcrPages(pages, siteFolderName, assetFolderName, imageMappings);
164+
jcrPages = await getJcrPages(pages, siteFolderName, assetFolderName, assetMappings);
165165
for (let i = 0; i < jcrPages.length; i += 1) {
166166
const page = jcrPages[i];
167167
// eslint-disable-next-line no-await-in-loop
@@ -188,5 +188,5 @@ export const createJcrPackage = async (
188188
await zip.generateAsync({ type: outputType })
189189
.then(async (blob) => saveFile(outputDirectory, `${packageName}.zip`, blob));
190190

191-
await saveImageMappings(imageMappings, outputDirectory);
191+
await saveAssetMappings(assetMappings, outputDirectory);
192192
};

src/package/packaging.utils.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ export function getFullAssetUrl(assetReference, pageUrl) {
241241

242242
// If the asset reference starts with './', it is a relative file path
243243
if (assetReference.startsWith('./')) {
244-
return new URL(assetReference, pageUrlObj.href).pathname;
244+
return new URL(assetReference, pageUrlObj.href).href;
245245
}
246246

247247
// Absolute asset reference, appending the asset path to the host

test/package/asset-mapping.test.js

+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/*
2+
* Copyright 2025 Adobe. All rights reserved.
3+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License. You may obtain a copy
5+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
6+
*
7+
* Unless required by applicable law or agreed to in writing, software distributed under
8+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9+
* OF ANY KIND, either express or implied. See the License for the specific language
10+
* governing permissions and limitations under the License.
11+
*/
12+
/* eslint-env mocha */
13+
import { readFile } from 'fs/promises';
14+
import { expect } from 'chai';
15+
import { getAssetUrlsFromMarkdown } from '../../src/package/asset-mapping.js';
16+
17+
const loadFile = async (file) => readFile(new URL(file, import.meta.url), 'utf-8');
18+
19+
describe('getAssetUrlsFromMarkdown', () => {
20+
it('should return an array of asset urls (reference urls)', () => {
21+
const markdownContent = `+-----------------------+
22+
| Hero |
23+
+=======================+
24+
| ![Car 1][image0] |
25+
| ![Car 2][image1] |
26+
+-----------------------+
27+
28+
[image0]: https://aem.live/car.jpeg
29+
[image1]: https://aem.live/car2.jpeg`;
30+
31+
const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
32+
expect(imageUrls).to.have.lengthOf(2);
33+
expect(imageUrls[0]).to.equal('https://aem.live/car.jpeg');
34+
expect(imageUrls[1]).to.equal('https://aem.live/car2.jpeg');
35+
});
36+
37+
it('should return an array of asset urls (inlined urls)', () => {
38+
const markdownContent = `+------------------------------------------+
39+
| Hero |
40+
+==========================================+
41+
| ![Car 1](https://aem.live/car.jpeg) |
42+
| ![Car 2](https://aem.live/car2.jpeg) |
43+
+------------------------------------------+`;
44+
45+
const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
46+
expect(imageUrls).to.have.lengthOf(2);
47+
expect(imageUrls[0]).to.equal('https://aem.live/car.jpeg');
48+
expect(imageUrls[1]).to.equal('https://aem.live/car2.jpeg');
49+
});
50+
51+
it('should return non-image asset URLs for various document types', () => {
52+
const markdownContent = `
53+
Click [PDF](/content/dam/doe/foo/bar.pdf) to download the guide.
54+
View [DOC](/content/dam/doe/docs/sample.doc).
55+
Open [DOCX](/content/dam/doe/docs/sample.docx).
56+
See [XLS](/content/dam/doe/spreadsheets/data.xls).
57+
Try [XLSX](/content/dam/doe/spreadsheets/data.xlsx).
58+
Read [PPT](/content/dam/doe/presentations/slide.ppt).
59+
Check [PPTX](/content/dam/doe/presentations/slide.pptx).
60+
Open [ODT](/content/dam/doe/texts/note.odt).
61+
Review [ODS](/content/dam/doe/spreadsheets/sheet.ods).
62+
Access [ODP](/content/dam/doe/presentations/deck.odp).
63+
Check [RTF](/content/dam/doe/docs/document.rtf).
64+
Read [TXT](/content/dam/doe/texts/readme.txt).
65+
Get [CSV](/content/dam/doe/data/records.csv).
66+
Invalid [Fake](/content/dam/doe/fake/myimage.pdf.png).
67+
Also check [here](https://example.live/siteFoo.html).
68+
`;
69+
70+
const assetUrls = getAssetUrlsFromMarkdown(markdownContent);
71+
72+
expect(assetUrls).to.have.lengthOf(13); // 13 valid document URLs
73+
expect(assetUrls).to.include('/content/dam/doe/foo/bar.pdf');
74+
expect(assetUrls).to.include('/content/dam/doe/docs/sample.doc');
75+
expect(assetUrls).to.include('/content/dam/doe/docs/sample.docx');
76+
expect(assetUrls).to.include('/content/dam/doe/spreadsheets/data.xls');
77+
expect(assetUrls).to.include('/content/dam/doe/spreadsheets/data.xlsx');
78+
expect(assetUrls).to.include('/content/dam/doe/presentations/slide.ppt');
79+
expect(assetUrls).to.include('/content/dam/doe/presentations/slide.pptx');
80+
expect(assetUrls).to.include('/content/dam/doe/texts/note.odt');
81+
expect(assetUrls).to.include('/content/dam/doe/spreadsheets/sheet.ods');
82+
expect(assetUrls).to.include('/content/dam/doe/presentations/deck.odp');
83+
expect(assetUrls).to.include('/content/dam/doe/docs/document.rtf');
84+
expect(assetUrls).to.include('/content/dam/doe/texts/readme.txt');
85+
expect(assetUrls).to.include('/content/dam/doe/data/records.csv');
86+
87+
// Ensure the invalid case is excluded
88+
expect(assetUrls).to.not.include('/content/dam/doe/fake/myimage.pdf.png');
89+
});
90+
91+
it('should return an array with no image urls', () => {
92+
const markdownContent = 'This is a markdown file with no images.';
93+
94+
const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
95+
expect(imageUrls).to.have.lengthOf(0);
96+
});
97+
98+
it('should return an array of image urls (absolute/relative urls)', () => {
99+
const markdownContent = `+------------------------------------------+
100+
| Hero |
101+
+==========================================+
102+
| ![Car 1](/car.jpeg) |
103+
| ![Car 2][image0] |
104+
+------------------------------------------+
105+
106+
[image0]: /test/car2.jpeg`;
107+
108+
const imageUrls = getAssetUrlsFromMarkdown(markdownContent);
109+
expect(imageUrls).to.have.lengthOf(2);
110+
expect(imageUrls[0]).to.equal('/car.jpeg');
111+
expect(imageUrls[1]).to.equal('/test/car2.jpeg');
112+
});
113+
114+
// should call getAssetUrlsFromMarkdown with the correct arguments
115+
it('test getAssetUrlsFromMarkdown', async () => {
116+
const markdown = await loadFile('../fixtures/mystique/hero.md');
117+
const imageUrl = await getAssetUrlsFromMarkdown(markdown);
118+
expect(imageUrl).to.be.an('array');
119+
expect(imageUrl).to.have.lengthOf(1);
120+
});
121+
});

0 commit comments

Comments
 (0)