From b72fe70d1b7b22ce7c0ec7d9fefe72db2f472885 Mon Sep 17 00:00:00 2001 From: Tobias Bocanegra Date: Tue, 6 Feb 2024 15:53:24 +0100 Subject: [PATCH] feat: serve or render sitemap.xml (#530) fixes #472 Co-authored-by: Dominique Pfister --- package-lock.json | 6 + package.json | 1 + src/index.js | 1 + src/sitemap-pipe.js | 146 +++++++++ test/fixtures/content/sitemap-bad-data.json | 3 + test/fixtures/content/sitemap-corrupt.json | 1 + test/fixtures/content/sitemap.json | 23 ++ test/fixtures/content/sitemap.xml | 8 + test/sitemap-pipe.test.js | 332 ++++++++++++++++++++ 9 files changed, 521 insertions(+) create mode 100644 src/sitemap-pipe.js create mode 100644 test/fixtures/content/sitemap-bad-data.json create mode 100644 test/fixtures/content/sitemap-corrupt.json create mode 100644 test/fixtures/content/sitemap.json create mode 100644 test/fixtures/content/sitemap.xml create mode 100644 test/sitemap-pipe.test.js diff --git a/package-lock.json b/package-lock.json index 27cff534..bcd13a26 100644 --- a/package-lock.json +++ b/package-lock.json @@ -21,6 +21,7 @@ "hast-util-to-string": "3.0.0", "hastscript": "9.0.0", "jose": "5.2.1", + "lodash.escape": "4.0.1", "mdast-util-to-hast": "13.1.0", "mdast-util-to-string": "4.0.0", "mime": "4.0.1", @@ -5267,6 +5268,11 @@ "integrity": "sha512-kZzYOKspf8XVX5AvmQF94gQW0lejFVgb80G85bU4ZWzoJ6C03PQg3coYAUpSTpQWelrZELd3XWgHzw4Ck5kaIw==", "dev": true }, + "node_modules/lodash.escape": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/lodash.escape/-/lodash.escape-4.0.1.tgz", + "integrity": "sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw==" + }, "node_modules/lodash.escaperegexp": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/lodash.escaperegexp/-/lodash.escaperegexp-4.1.2.tgz", diff --git a/package.json b/package.json index e1e2d77e..2a65709a 100644 --- a/package.json +++ b/package.json @@ -55,6 +55,7 @@ "hast-util-to-string": "3.0.0", "hastscript": "9.0.0", 
"jose": "5.2.1", + "lodash.escape": "4.0.1", "mdast-util-to-hast": "13.1.0", "mdast-util-to-string": "4.0.0", "mime": "4.0.1", diff --git a/src/index.js b/src/index.js index c4cef5ad..66d3f1d2 100644 --- a/src/index.js +++ b/src/index.js @@ -13,6 +13,7 @@ export * from './html-pipe.js'; export * from './json-pipe.js'; export * from './auth-pipe.js'; export * from './options-pipe.js'; +export * from './sitemap-pipe.js'; export * from './PipelineContent.js'; export * from './PipelineRequest.js'; export * from './PipelineResponse.js'; diff --git a/src/sitemap-pipe.js b/src/sitemap-pipe.js new file mode 100644 index 00000000..fd713fe8 --- /dev/null +++ b/src/sitemap-pipe.js @@ -0,0 +1,146 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. 
+ */ +import escape from 'lodash.escape'; +import { cleanupHeaderValue } from '@adobe/helix-shared-utils'; +import { authenticate } from './steps/authenticate.js'; +import fetchContent from './steps/fetch-content.js'; +import renderCode from './steps/render-code.js'; +import setXSurrogateKeyHeader from './steps/set-x-surrogate-key-header.js'; +import setCustomResponseHeaders from './steps/set-custom-response-headers.js'; +import { PipelineStatusError } from './PipelineStatusError.js'; +import { PipelineResponse } from './PipelineResponse.js'; +import initConfig from './steps/init-config.js'; +import { extractLastModified, updateLastModified } from './utils/last-modified.js'; + +async function generateSitemap(state, partition) { + const { + owner, repo, ref, contentBusId, s3Loader, log, + previewHost, liveHost, prodHost, + } = state; + const ret = await s3Loader.getObject('helix-content-bus', `${contentBusId}/live/sitemap.json`); + if (ret.status !== 200) { + return ret; + } + let config; + try { + config = JSON.parse(ret.body); + } catch (e) { + log.info('failed to parse /sitemap.json', e); + throw new PipelineStatusError(404, `Failed to parse /sitemap.json: ${e.message}`); + } + const { data } = config; + if (!data || !Array.isArray(data)) { + throw new PipelineStatusError(404, 'Expected \'data\' array not found in /sitemap.json'); + } + const host = partition === 'preview' + ? (previewHost || `${ref}--${repo}--${owner}.hlx.page`) + : (prodHost || liveHost || `${ref}--${repo}--${owner}.hlx.live`); + const loc = ({ path, lastModified }) => ` + https://${host}${escape(path)} + ${new Date(lastModified * 1000).toISOString().substring(0, 10)} + `; + const xml = [ + '', + '', + ...data.map((record) => loc(record)), + '', + ].join('\n'); + return new PipelineResponse(xml, { + status: 200, + headers: { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': ret.headers.get('last-modified'), + }, + }); +} + +/** + * Serves or renders the sitemap xml. 
The sitemap is always served from the preview content-bus + * partition. + * + * If no sitemap.xml is found there, the sitemap is generated on the fly from the + * sitemap.json stored in the live content-bus partition. + * + * @param {PipelineState} state + * @param {PipelineRequest} req + * @returns {PipelineResponse} + */ +export async function sitemapPipe(state, req) { + const { partition, log } = state; + state.type = 'sitemap'; + + // force loading from preview + state.partition = 'preview'; + + if (state.info?.path !== '/sitemap.xml') { + // this should not happen as it would mean that the caller used the wrong route. so we respond + // with a 500 to indicate that something is wrong. + return new PipelineResponse('', { + status: 500, + headers: { + 'x-error': 'invalid route', + }, + }); + } + + /** @type PipelineResponse */ + const res = new PipelineResponse('', { + headers: { + 'content-type': 'text/plain; charset=utf-8', + }, + }); + + try { + await initConfig(state, req, res); + + // await requireProject(state, req, res); + if (res.error !== 401) { + await authenticate(state, req, res); + } + + // ...and apply the folder mapping + state.timer?.update('content-fetch'); + + // fetch sitemap.xml + await fetchContent(state, req, res); + if (res.status === 404) { + const ret = await generateSitemap(state, partition); + if (ret.status === 200) { + res.status = 200; + updateLastModified(state, res, extractLastModified(ret.headers)); + delete res.error; + state.content.data = ret.body; + } + } + if (res.error) { + // if content loading produced an error, we're done. + throw new PipelineStatusError(res.status, res.error); + } + + state.timer?.update('serialize'); + await renderCode(state, req, res); + await setCustomResponseHeaders(state, req, res); + await setXSurrogateKeyHeader(state, req, res); + } catch (e) { + res.error = e.message; + res.status = e.code || 500; + + const level = res.status >= 500 ? 
'error' : 'info'; + log[level](`pipeline status: ${res.status} ${res.error}`); + res.headers.set('x-error', cleanupHeaderValue(res.error)); + if (res.status < 500) { + await setCustomResponseHeaders(state, req, res); + await setXSurrogateKeyHeader(state, req, res); + } + } + return res; +} diff --git a/test/fixtures/content/sitemap-bad-data.json b/test/fixtures/content/sitemap-bad-data.json new file mode 100644 index 00000000..8aabe644 --- /dev/null +++ b/test/fixtures/content/sitemap-bad-data.json @@ -0,0 +1,3 @@ +{ + "data": "this is not an array" +} diff --git a/test/fixtures/content/sitemap-corrupt.json b/test/fixtures/content/sitemap-corrupt.json new file mode 100644 index 00000000..7e31dc3c --- /dev/null +++ b/test/fixtures/content/sitemap-corrupt.json @@ -0,0 +1 @@ +this is not JSON \ No newline at end of file diff --git a/test/fixtures/content/sitemap.json b/test/fixtures/content/sitemap.json new file mode 100644 index 00000000..ce249983 --- /dev/null +++ b/test/fixtures/content/sitemap.json @@ -0,0 +1,23 @@ +{ + "total": 2, + "offset": 0, + "limit": 8, + "columns": [ + "path", + "lastModified", + "robots" + ], + "data": [ + { + "path": "/", + "lastModified": 1701361070, + "robots": "" + }, + { + "lastModified": 1703163776, + "path": "/test", + "robots": "" + } + ], + ":type": "sheet" +} \ No newline at end of file diff --git a/test/fixtures/content/sitemap.xml b/test/fixtures/content/sitemap.xml new file mode 100644 index 00000000..f08bc560 --- /dev/null +++ b/test/fixtures/content/sitemap.xml @@ -0,0 +1,8 @@ + + + https://www.aem.live/ + + + https://www.aem.live/developer + + diff --git a/test/sitemap-pipe.test.js b/test/sitemap-pipe.test.js new file mode 100644 index 00000000..87c01b06 --- /dev/null +++ b/test/sitemap-pipe.test.js @@ -0,0 +1,332 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +/* eslint-env mocha */ +import assert from 'assert'; +import { FileS3Loader } from './FileS3Loader.js'; +import { + sitemapPipe, PipelineRequest, PipelineState, +} from '../src/index.js'; + +const DEFAULT_CONFIG = { + contentBusId: 'foobar', + owner: 'owner', + repo: 'repo', +}; + +const DEFAULT_STATE = (opts = {}) => (new PipelineState({ + config: DEFAULT_CONFIG, + site: 'site', + org: 'org', + ref: 'ref', + partition: 'preview', + s3Loader: new FileS3Loader(), + ...opts, +})); + +describe('Sitemap Pipe Test', () => { + it('responds with 500 for non sitemap', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE(), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 500); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'x-error': 'invalid route', + }); + }); + + it('responds with 500 for content-bus errors', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + s3Loader: new FileS3Loader().status('sitemap.xml', 500), + path: '/sitemap.xml', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 502); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'failed to load /sitemap.xml from content-bus: 500', + }); + }); + + it('responds with 404 for sitemap and json not found', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404) + .status('sitemap.json', 404), + 
path: '/sitemap.xml', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 404); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'failed to load /sitemap.xml from content-bus: 404', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + }); + + it('responds with 404 for sitemap not found and corrupt json', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404) + .rewrite('sitemap.json', 'sitemap-corrupt.json'), + path: '/sitemap.xml', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 404); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'Failed to parse /sitemap.json: Unexpected token \'h\', "this is not JSON" is not valid JSON', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + }); + + it('responds with 404 for sitemap not found and bad \'data\' property', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404) + .rewrite('sitemap.json', 'sitemap-bad-data.json'), + path: '/sitemap.xml', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 404); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'text/plain; charset=utf-8', + 'x-error': "Expected 'data' array not found in /sitemap.json", + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + }); + + it('serves sitemap from preview', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + path: '/sitemap.xml', + timer: { + update: () => { }, + }, + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + 
assert.strictEqual(resp.status, 200); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': 'Fri, 30 Apr 2021 03:47:18 GMT', + 'x-surrogate-key': 'rCCgYLwPe4ckYgJ7 RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + assert.strictEqual(resp.body, ` + + https://www.aem.live/ + + + https://www.aem.live/developer + + +`); + }); + + it('renders sitemap from preview with fallback origin', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404), + path: '/sitemap.xml', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 200); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': 'Fri, 30 Apr 2021 03:47:18 GMT', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + assert.strictEqual(resp.body, ` + + + https://ref--repo--owner.hlx.page/ + 2023-11-30 + + + https://ref--repo--owner.hlx.page/test + 2023-12-21 + +`); + }); + + it('renders sitemap from preview with preview host', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + config: { + ...DEFAULT_CONFIG, + cdn: { + preview: { + host: '$ref--$repo--$owner.my.page', + }, + }, + }, + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404), + path: '/sitemap.xml', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 200); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': 'Fri, 30 Apr 2021 03:47:18 GMT', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + assert.strictEqual(resp.body, ` + + + https://ref--repo--owner.my.page/ + 2023-11-30 + + + https://ref--repo--owner.my.page/test + 2023-12-21 + 
+`); + }); + + it('renders sitemap from live with prod CDN', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + config: { + ...DEFAULT_CONFIG, + cdn: { + prod: { + host: 'www.adobe.com', + }, + }, + }, + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404), + path: '/sitemap.xml', + partition: 'live', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 200); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': 'Fri, 30 Apr 2021 03:47:18 GMT', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + assert.strictEqual(resp.body, ` + + + https://www.adobe.com/ + 2023-11-30 + + + https://www.adobe.com/test + 2023-12-21 + +`); + }); + + it('renders sitemap from live with live host', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + config: { + ...DEFAULT_CONFIG, + cdn: { + live: { + host: '$ref--$repo--$owner.my.live', + }, + }, + }, + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404), + path: '/sitemap.xml', + partition: 'live', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + assert.strictEqual(resp.status, 200); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': 'Fri, 30 Apr 2021 03:47:18 GMT', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + assert.strictEqual(resp.body, ` + + + https://ref--repo--owner.my.live/ + 2023-11-30 + + + https://ref--repo--owner.my.live/test + 2023-12-21 + +`); + }); + + it('renders sitemap from live with fallback origin', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + s3Loader: new FileS3Loader() + .status('sitemap.xml', 404), + path: '/sitemap.xml', + partition: 'live', + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + + 
assert.strictEqual(resp.status, 200); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'application/xml; charset=utf-8', + 'last-modified': 'Fri, 30 Apr 2021 03:47:18 GMT', + 'x-surrogate-key': 'RXei-6EcTEMTEIqi foobar_metadata ref--repo--owner_head', + }); + assert.strictEqual(resp.body, ` + + + https://ref--repo--owner.hlx.live/ + 2023-11-30 + + + https://ref--repo--owner.hlx.live/test + 2023-12-21 + +`); + }); + + it('handles pipeline errors', async () => { + const resp = await sitemapPipe( + DEFAULT_STATE({ + path: '/sitemap.xml', + timer: { + update: () => { + throw new Error('boom!'); + }, + }, + }), + new PipelineRequest(new URL('https://www.hlx.live/')), + ); + + assert.strictEqual(resp.status, 500); + assert.deepStrictEqual(Object.fromEntries(resp.headers.entries()), { + 'content-type': 'text/plain; charset=utf-8', + 'x-error': 'boom!', + }); + assert.strictEqual(resp.body, ''); + }); +});