diff --git a/importer-guidelines.md b/importer-guidelines.md index b2f261a5..005fe9b2 100644 --- a/importer-guidelines.md +++ b/importer-guidelines.md @@ -393,30 +393,81 @@ You can do something like: transform: ({ document, params }) => { const main = document.querySelector('main'); - WebImporter.DOMUtils.remove(main, [ - '.hero', - ]); - const listOfAllImages = [...main.querySelectorAll('img')].map((img) => img.src); + const listOfAllMeta = [...document.querySelectorAll('meta')].map((meta) => { + const name = meta.getAttribute('name') || meta.getAttribute('property'); + if (name) { + return { name, content: meta.content } + } + return null; + }).filter((meta) => meta); return [{ element: main, - path: '/index', + path: new URL(params.originalURL).pathname.replace(/\/$/, '').replace(/\.html$/, ''), report: { title: document.title, - "List Of All Images": listOfAllImages - } + "List of images": listOfAllImages, + metadata: listOfAllMeta, + }, }]; }, } ``` -For each imported entry, this will add 2 columns to the report: +For each imported entry, a `docx` file is created and 3 columns are added to the report: - `title` column: the document title -- `List Of All Images`column: a JSON stringified value of the list of all the images in the `main` element, +- `List of images` column: a JSON stringified value of the list of all the images in the `main` element +- `metadata` column: a JSON stringified value of the list of all the metadata in the document + +The report would look like this: + +| URL | path | docx | status | redirect | title | List of images | metadata | +|-------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------|-----------------| +| https://www.sample.com/ | / | | Success | | Sample page title | ["https://www.sample.com/img1", "https://www.sample.com/img2"] | [{"name":"viewport","content":"width=device-width,initial-scale=1"},{"name":"description","content":"Sample site homepage 
description"},...] | +| https://www.sample.com/page1.html | /page1 | | Success | | Sample page 1 title | ["https://www.sample.com/img3", "https://www.otherdomain.com/img"] | [{"name":"viewport","content":"width=device-width,initial-scale=1"},{"name":"description","content":"Sample site page 1 description"},...] | + +The report extra columns are created based on the top level properties in the `report` object. We recommend that the value be a string so that it is easy to consume in Excel but, in theory, it can be anything that can be passed to `JSON.stringify`. + +Depending on your Excel skills and your needs you can be creative and easily customise the report. + +### Collect data vs importing content + +The report capability previously described can be used as another feature: collect site data in one Excel file. The `element` property of the returned object(s) is optional, i.e. if you omit it, you can create an import that will only collect some data on each page and report it back in the report file. 
+ +With the same code as above, just remove the `element` property of the returned object: + +```js +{ + transform: ({ document, params }) => { + const main = document.querySelector('main'); + + const listOfAllImages = [...main.querySelectorAll('img')].map((img) => img.src); + const listOfAllMeta = [...document.querySelectorAll('meta')].map((meta) => { + const name = meta.getAttribute('name') || meta.getAttribute('property'); + if (name) { + return { name, content: meta.content } + } + return null; + }).filter((meta) => meta); + + return [{ + // do not return an element + // element: main, + path: new URL(params.originalURL).pathname.replace(/\/$/, '').replace(/\.html$/, ''), + report: { + title: document.title, + "List of images": listOfAllImages, + metadata: listOfAllMeta, + }, + }]; + }, +} +``` + +For each URL of the import, this will NOT create a `docx` per URL but only feed the report with extra columns for each row / URL imported: `title`, `List of images` and `metadata` columns will be appended to the report. -The report extra columns will be created based on the top level properties in the `report` object. We recommand the value to be a string for easiness to consume in Excel but, in theory, it can be anything that can be `JSON.stringify`. -You can be creative and customise the report as needed. +With this method, you can construct an `xlsx` spreadsheet with the site data you want to collect without creating the corresponding `docx` files. 
### More samples diff --git a/package-lock.json b/package-lock.json index b1758d1a..0a346f3b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,7 +8,7 @@ "name": "helix-import-ui", "version": "1.20.0", "dependencies": { - "@adobe/helix-importer": "2.4.1", + "@adobe/helix-importer": "2.5.0", "@adobe/mdast-util-gridtables": "1.0.3", "@adobe/remark-gridtables": "1.0.0", "@spectrum-web-components/bundle": "0.28.5", @@ -101,9 +101,9 @@ } }, "node_modules/@adobe/helix-importer": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@adobe/helix-importer/-/helix-importer-2.4.1.tgz", - "integrity": "sha512-aA+2xm7cHvmesnaDwJavoUX4GB6xv5lcFAa79Pe/9NVChTn23+QAX8cDz9McJRmSZ09Ui2Eimjj7e4FMppJqCA==", + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/@adobe/helix-importer/-/helix-importer-2.5.0.tgz", + "integrity": "sha512-gC5PfllGHlLiHxVYMVuNqCA9Q5zyvIlR/GdSEsuvjxrtz9Fsky7/wl5Nr0OeLEClo8P/aRx6+ZmVUuekQ8hEnw==", "dependencies": { "@adobe/helix-markdown-support": "6.0.0", "@adobe/helix-md2docx": "2.0.28", @@ -18333,9 +18333,9 @@ } }, "@adobe/helix-importer": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/@adobe/helix-importer/-/helix-importer-2.4.1.tgz", - "integrity": "sha512-aA+2xm7cHvmesnaDwJavoUX4GB6xv5lcFAa79Pe/9NVChTn23+QAX8cDz9McJRmSZ09Ui2Eimjj7e4FMppJqCA==", + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/@adobe/helix-importer/-/helix-importer-2.5.0.tgz", + "integrity": "sha512-gC5PfllGHlLiHxVYMVuNqCA9Q5zyvIlR/GdSEsuvjxrtz9Fsky7/wl5Nr0OeLEClo8P/aRx6+ZmVUuekQ8hEnw==", "requires": { "@adobe/helix-markdown-support": "6.0.0", "@adobe/helix-md2docx": "2.0.28", diff --git a/package.json b/package.json index 077300ac..7c533c6f 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ "semantic-release": "semantic-release" }, "dependencies": { - "@adobe/helix-importer": "2.4.1", + "@adobe/helix-importer": "2.5.0", "@adobe/mdast-util-gridtables": "1.0.3", "@adobe/remark-gridtables": "1.0.0", 
"@spectrum-web-components/bundle": "0.28.5",