Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: crawl URLs in <meta> tags #9900

Merged
merged 13 commits into from
May 17, 2023
5 changes: 5 additions & 0 deletions .changeset/thirty-garlics-tan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@sveltejs/kit': minor
---

feat: crawl URLs in `<meta>` tags
111 changes: 68 additions & 43 deletions packages/kit/src/core/postbuild/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/;

const WHITESPACE = /[\s\n\r]/;

/**
 * `name`/`property` values of `<meta>` tags whose `content` attribute is
 * treated as a URL and added to the crawled hrefs.
 *
 * Each Open Graph media kind contributes three keys: the bare key plus its
 * `:url` and `:secure_url` variants.
 */
const CRAWLABLE_META_NAME_ATTRS = new Set([
	'og:url',
	...['image', 'video', 'audio'].flatMap((media) => [
		`og:${media}`,
		`og:${media}:url`,
		`og:${media}:secure_url`
	]),
	'twitter:image'
]);

/**
* @param {string} html
* @param {string} base
Expand Down Expand Up @@ -81,6 +95,9 @@ export function crawl(html, base) {

const tag = html.slice(start, i).toUpperCase();

/** @type {Record<string, string>} */
const attributes = {};

if (tag === 'SCRIPT' || tag === 'STYLE') {
while (i < html.length) {
if (
Expand All @@ -95,9 +112,6 @@ export function crawl(html, base) {
}
}

let href = '';
let rel = '';

while (i < html.length) {
const start = i;

Expand Down Expand Up @@ -159,44 +173,7 @@ export function crawl(html, base) {
}

value = decode(value);

if (name === 'href') {
if (tag === 'BASE') {
base = resolve(base, value);
} else {
href = resolve(base, value);
}
} else if (name === 'id') {
ids.push(value);
} else if (name === 'name') {
if (tag === 'A') ids.push(value);
} else if (name === 'rel') {
rel = value;
} else if (name === 'src') {
if (value) hrefs.push(resolve(base, value));
} else if (name === 'srcset') {
const candidates = [];
let insideURL = true;
value = value.trim();
for (let i = 0; i < value.length; i++) {
if (
value[i] === ',' &&
(!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))
) {
candidates.push(value.slice(0, i));
value = value.substring(i + 1).trim();
i = 0;
insideURL = true;
} else if (WHITESPACE.test(value[i])) {
insideURL = false;
}
}
candidates.push(value);
for (const candidate of candidates) {
const src = candidate.split(WHITESPACE)[0];
if (src) hrefs.push(resolve(base, src));
}
}
attributes[name] = value;
} else {
i -= 1;
}
Expand All @@ -205,8 +182,56 @@ export function crawl(html, base) {
i += 1;
}

if (href && !/\bexternal\b/i.test(rel)) {
hrefs.push(resolve(base, href));
const { href, id, name, property, rel, src, srcset, content } = attributes;

if (href) {
if (tag === 'BASE') {
base = resolve(base, href);
} else if (!rel || !/\bexternal\b/i.test(rel)) {
hrefs.push(resolve(base, href));
}
}

if (id) {
ids.push(id);
}

if (name && tag === 'A') {
ids.push(name);
}

if (src) {
hrefs.push(resolve(base, src));
}

if (srcset) {
let value = srcset;
const candidates = [];
let insideURL = true;
value = value.trim();
for (let i = 0; i < value.length; i++) {
if (value[i] === ',' && (!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))) {
candidates.push(value.slice(0, i));
value = value.substring(i + 1).trim();
i = 0;
insideURL = true;
} else if (WHITESPACE.test(value[i])) {
insideURL = false;
}
}
candidates.push(value);
for (const candidate of candidates) {
const src = candidate.split(WHITESPACE)[0];
if (src) hrefs.push(resolve(base, src));
}
}

if (tag === 'META' && content) {
const attr = name ?? property;

if (attr && CRAWLABLE_META_NAME_ATTRS.has(attr)) {
hrefs.push(resolve(base, content));
}
}
}
}
Expand Down
14 changes: 14 additions & 0 deletions packages/kit/src/core/postbuild/fixtures/meta/input.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="description" content="This is a description" />

<!--Only these should get crawled-->
<meta content="https://external.com" name="twitter:image" />
<meta name="og:image" content="/og-image.jpg" />
<meta property="og:audio" content="https://example.com/audio.mp3" />
<meta content="/video.mp4" property="og:video"/>
</head>
<body></body>
</html>
4 changes: 4 additions & 0 deletions packages/kit/src/core/postbuild/fixtures/meta/output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"hrefs": ["https://external.com", "/og-image.jpg", "https://example.com/audio.mp3", "/video.mp4"],
"ids": []
}