From b0c425d21e38b3ae065b31635b5e8d4dff3afcbb Mon Sep 17 00:00:00 2001 From: weareoutman Date: Fri, 24 Nov 2023 15:00:58 +0800 Subject: [PATCH] feat: do not index unlisted content closes #371 --- .eslintrc | 1 + .../src/server/utils/parse.spec.ts | 20 ++++++++++++++++++- .../src/server/utils/parse.ts | 9 ++++++++- .../src/server/utils/scanDocuments.spec.ts | 10 +++++++++- .../src/server/utils/scanDocuments.ts | 13 ++++++------ website/docs/unlisted-post.md | 9 +++++++++ 6 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 website/docs/unlisted-post.md diff --git a/.eslintrc b/.eslintrc index a1d1a6ea..130bd542 100644 --- a/.eslintrc +++ b/.eslintrc @@ -35,6 +35,7 @@ } ], "rules": { + "@typescript-eslint/no-explicit-any": "warn", "react-hooks/rules-of-hooks": "error", "react-hooks/exhaustive-deps": "warn" } diff --git a/docusaurus-search-local/src/server/utils/parse.spec.ts b/docusaurus-search-local/src/server/utils/parse.spec.ts index 4d5d6e15..53d221f7 100644 --- a/docusaurus-search-local/src/server/utils/parse.spec.ts +++ b/docusaurus-search-local/src/server/utils/parse.spec.ts @@ -5,7 +5,7 @@ import { import { parse } from "./parse"; describe("parse", () => { - test.each<[string, "docs" | "blog" | "page", ParsedDocument]>([ + test.each<[string, "docs" | "blog" | "page", ParsedDocument | null]>([ [ `
@@ -84,6 +84,24 @@ describe("parse", () => { breadcrumb: [], }, ], + [ + ` + + + +
+
+

Hello World

+
+
+ Test + Peace. +
+
+ `, + "docs", + null, + ], ])("parse(...) should work", (html, type, doc) => { expect( parse(html, type, "", { diff --git a/docusaurus-search-local/src/server/utils/parse.ts b/docusaurus-search-local/src/server/utils/parse.ts index f2d740ff..8f3e782b 100644 --- a/docusaurus-search-local/src/server/utils/parse.ts +++ b/docusaurus-search-local/src/server/utils/parse.ts @@ -11,8 +11,15 @@ export function parse( type: "docs" | "blog" | "page", url: string, { ignoreCssSelectors }: ProcessedPluginOptions -): ParsedDocument { +): ParsedDocument | null { const $ = cheerio.load(html); + + const robotsMeta = $('meta[name="robots"]'); + if (robotsMeta.attr("content")?.includes("noindex")) { + // Unlisted content + return null; + } + // Remove copy buttons from code boxes $('div[class^="mdxCodeBlock_"] button').remove(); diff --git a/docusaurus-search-local/src/server/utils/scanDocuments.spec.ts b/docusaurus-search-local/src/server/utils/scanDocuments.spec.ts index d1045a48..6fecaf3f 100644 --- a/docusaurus-search-local/src/server/utils/scanDocuments.spec.ts +++ b/docusaurus-search-local/src/server/utils/scanDocuments.spec.ts @@ -33,6 +33,12 @@ describe("scanDocuments", () => { url: "/2", type: "page", }, + { + // Unlisted + filePath: "/tmp/3", + url: "/3", + type: "docs", + }, ]; mockParse.mockImplementation((html) => { if (html.includes("1")) { @@ -52,7 +58,7 @@ describe("scanDocuments", () => { ], breadcrumb: ["Docs"], }; - } else { + } else if (html.includes("2")) { return { pageTitle: "Hello First Page", sections: [ @@ -64,6 +70,8 @@ describe("scanDocuments", () => { ], breadcrumb: [], }; + } else { + return null; } }); const allDocuments = await scanDocuments( diff --git a/docusaurus-search-local/src/server/utils/scanDocuments.ts b/docusaurus-search-local/src/server/utils/scanDocuments.ts index c6a25714..cd28cbc0 100644 --- a/docusaurus-search-local/src/server/utils/scanDocuments.ts +++ b/docusaurus-search-local/src/server/utils/scanDocuments.ts @@ -35,12 +35,13 @@ 
export async function scanDocuments( ); const html = await readFileAsync(filePath, { encoding: "utf8" }); - const { pageTitle, sections, breadcrumb } = parse( - html, - type, - url, - config - ); + + const parsed = parse(html, type, url, config); + if (!parsed) { + // Unlisted content + return; + } + const { pageTitle, sections, breadcrumb } = parsed; const titleId = getNextDocId(); diff --git a/website/docs/unlisted-post.md b/website/docs/unlisted-post.md new file mode 100644 index 00000000..91c98bb8 --- /dev/null +++ b/website/docs/unlisted-post.md @@ -0,0 +1,9 @@ +--- +unlisted: true +--- + +# Unlisted Post + +This unlisted post should be "hidden" in production, but remain accessible. + +It is filtered from the sidebar, sitemap, SEO indexation...