From e5747d50cc728c985d17f3fd28ebb9bd15834234 Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 19:59:35 +0900 Subject: [PATCH 1/7] feat: improve --- .../webforai/src/extract-hast/readability.ts | 20 +++++-------- packages/webforai/src/extract-hast/utils.ts | 2 +- .../webforai/src/html-to-markdown.test.ts | 30 ++++++++++++------- packages/webforai/src/mdast-to-markdown.ts | 1 + 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/packages/webforai/src/extract-hast/readability.ts b/packages/webforai/src/extract-hast/readability.ts index 086dc90..4a0f990 100644 --- a/packages/webforai/src/extract-hast/readability.ts +++ b/packages/webforai/src/extract-hast/readability.ts @@ -97,7 +97,6 @@ const unlikelyElementFilter = (node: Hast) => { return true; } const element = node as Element; - const match = matchString(element); // Skip main content elements if (["body", "article", "main", "section", "a"].includes(element.tagName)) { @@ -106,6 +105,7 @@ const unlikelyElementFilter = (node: Hast) => { if (hasAncestors(element, ["table", "code"], 3)) { return true; } + const match = matchString(element); // Remove unlikely candidates if (REGEXPS.unlikelyCandidates.test(match) && !REGEXPS.okMaybeItsaCandidate.test(match)) { @@ -136,7 +136,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => { const body = select("body", hast) ?? hast; const proxiedHast = parents(body) as unknown as ProxiedHast; - let baseFilterd = filter(proxiedHast, (node) => { + const baseFilterd = filter(proxiedHast, (node) => { if (!metadataFilter(node as Hast)) { return false; } @@ -152,17 +152,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => { const baseText = hastToString(baseFilterd); let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH[lang as keyof typeof BASE_MINIMAL_LENGTH] : 500; - if (baseText.length > minimalLength) { - const filterd = filter(baseFilterd, (node) => { - if (!unlikelyElementFilter(node as Hast)) { - return false; - } - return true; - }); - if (filterd) { - baseFilterd = filterd; - } - } else { + if (baseText.length < minimalLength) { minimalLength = Math.max(0, baseText.length - 200); } @@ -193,6 +183,10 @@ export const readabilityExtractHast = (hast: Hast): Hast => { if (!removeEmptyFilter(node as Hast)) { return false; } + if (!unlikelyElementFilter(node as Hast)) { + return false; + } + return true; }) as Hast; diff --git a/packages/webforai/src/extract-hast/utils.ts b/packages/webforai/src/extract-hast/utils.ts index 2cef842..f140433 100644 --- a/packages/webforai/src/extract-hast/utils.ts +++ b/packages/webforai/src/extract-hast/utils.ts @@ -1,7 +1,7 @@ import type { Element } from "hast"; export const matchString = (element: Element) => - `${element.tagName} ${element.properties.id} ${classnames(element).join(" ")} }`; + `${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`; export const classnames = (element: Element) => { if (Array.isArray(element.properties.className)) { diff --git a/packages/webforai/src/html-to-markdown.test.ts b/packages/webforai/src/html-to-markdown.test.ts index c54e94e..e0b2f5d 100644 --- a/packages/webforai/src/html-to-markdown.test.ts +++ b/packages/webforai/src/html-to-markdown.test.ts @@ -45,8 +45,8 @@ Example ![Example](/example.jpg) -* Item 1 -* Item 2 +- Item 1 +- Item 2 `; const imageHidden = `# Hello, world! @@ -55,8 +55,8 @@ This is a paragraph. [Example](/example.html) -* Item 1 -* Item 2 +- Item 1 +- Item 2 `; const htmlTable = ` @@ -85,31 +85,31 @@ describe("htmlToMarkdown", () => { it("should convert HTML to Markdown", () => { const markdown = htmlToMarkdown(html, { extractors: false }); const d = distance(markdown, expected); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML to Markdown with replaced base URL", () => { const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false }); const d = distance(markdown, baseUrlReplaced); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML to Markdown with links as text", () => { const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false }); const d = distance(markdown, linkAsText); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML to Markdown with hidden images", () => { const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false }); const d = distance(markdown, imageHidden); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML table to Markdown table", () => { const markdown = htmlToMarkdown(htmlTable, { extractors: false }); const d = distance(markdown, expectedTableMarkdown); - expect(d).lte(4); + expect(d).lte(5); }); it("should convert HTML table with table as text option", () => { @@ -127,6 +127,16 @@ describe("htmlToMarkdown E2E", () => { // @ts-ignore const original = await import("../README.md?raw"); const d = distance(markdown, original.default); - expect(d).lte(1000); // I'd like to optimise more! + expect(d).lte(400); // I'd like to optimise more! + }); + + it("should convert GitHub README to Markdown ", async () => { + const html = await fetch("https://github.com/inaridiy/webforai").then((res) => res.text()); + const markdown = htmlToMarkdown(html, { baseUrl: "https://www.npmjs.com/package/webforai" }); + + // @ts-ignore + const original = await import("../../../README.md?raw"); + const d = distance(markdown, original.default); + expect(d).lte(400); // I'd like to optimise more! }); }); diff --git a/packages/webforai/src/mdast-to-markdown.ts b/packages/webforai/src/mdast-to-markdown.ts index 831551f..32fd692 100644 --- a/packages/webforai/src/mdast-to-markdown.ts +++ b/packages/webforai/src/mdast-to-markdown.ts @@ -22,6 +22,7 @@ export interface MdastToMarkdownOptions extends ToMarkdownOptions { */ export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = { extensions: [gfmToMarkdown(), mathToMarkdown()], + bullet: "-", }; /** From a9b091f126c05d3876d12b39cf8e2806ef4b0a47 Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 20:21:29 +0900 Subject: [PATCH 2/7] feat: improve --- .../webforai/src/extract-hast/readability.ts | 20 +++++++++++++++++++ .../webforai/src/html-to-markdown.test.ts | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/packages/webforai/src/extract-hast/readability.ts b/packages/webforai/src/extract-hast/readability.ts index 4a0f990..24fb939 100644 --- a/packages/webforai/src/extract-hast/readability.ts +++ b/packages/webforai/src/extract-hast/readability.ts @@ -115,19 +115,39 @@ const unlikelyElementFilter = (node: Hast) => { return true; }; +const isImageLink = (element: Element) => { + const a = select("a", element); + const img = select("img", a); + + if (!(a && img)) { + return false; + } + + const imgFilename = img.properties.src?.toString().split("/").pop(); + const hrefFilename = a.properties.href?.toString().split("/").pop(); + + return imgFilename === hrefFilename && imgFilename; +}; + const removeEmptyFilter = (node: Hast) => { if (node.type !== "element") { return true; } const element = node as Element; + if (!PARAGRAPH_TAGS.includes(element.tagName)) { return true; } + if (isImageLink(element)) { + return true; + } + const text = hastToString(element); if (text.length < 10) { return false; } + return true; }; diff --git a/packages/webforai/src/html-to-markdown.test.ts b/packages/webforai/src/html-to-markdown.test.ts index e0b2f5d..8754482 100644 --- a/packages/webforai/src/html-to-markdown.test.ts +++ b/packages/webforai/src/html-to-markdown.test.ts @@ -137,6 +137,6 @@ describe("htmlToMarkdown E2E", () => { // @ts-ignore const original = await import("../../../README.md?raw"); const d = distance(markdown, original.default); - expect(d).lte(400); // I'd like to optimise more! + expect(d).lte(200); // I'd like to optimise more! }); }); From 47647676a838b922e2cf32b1d3637c8153b996dd Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 20:22:56 +0900 Subject: [PATCH 3/7] Minor performance improvements --- .changeset/proud-pillows-sort.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/proud-pillows-sort.md diff --git a/.changeset/proud-pillows-sort.md b/.changeset/proud-pillows-sort.md new file mode 100644 index 0000000..3a701c5 --- /dev/null +++ b/.changeset/proud-pillows-sort.md @@ -0,0 +1,5 @@ +--- +"webforai": patch +--- + +Minor performance improvements From 4437e28e1e7807fd061aee99510ea2d3f71a2a78 Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 22:55:54 +0900 Subject: [PATCH 4/7] feat: accuracy improvement --- .changeset/nervous-eagles-hope.md | 5 ++++ packages/webforai/src/extract-mdast.ts | 25 +++++++++++++++++++ packages/webforai/src/html-to-mdast.ts | 9 ++++--- .../src/mdast-handlers/custom-a-handler.ts | 7 +++++- 4 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 .changeset/nervous-eagles-hope.md create mode 100644 packages/webforai/src/extract-mdast.ts diff --git a/.changeset/nervous-eagles-hope.md b/.changeset/nervous-eagles-hope.md new file mode 100644 index 0000000..b759fc3 --- /dev/null +++ b/.changeset/nervous-eagles-hope.md @@ -0,0 +1,5 @@ +--- +"webforai": patch +--- + +accuracy improvement diff --git a/packages/webforai/src/extract-mdast.ts b/packages/webforai/src/extract-mdast.ts new file mode 100644 index 0000000..5f3177e --- /dev/null +++ b/packages/webforai/src/extract-mdast.ts @@ -0,0 +1,25 @@ +import type { Nodes as Mdast, Parent } from "mdast"; +import { filter } from "unist-util-filter"; + +const DECLATION_TYPES = ["blockquote", "strong", "emphasis", "delete"]; + +const emptyDeclarationFilter = (node: Mdast) => { + if (!DECLATION_TYPES.includes(node.type)) { + return true; + } + if ((node as Parent).children.length === 0) { + return false; + } + + return true; +}; + +export const extractMdast = (node: Mdast) => { + const extracted = filter(node, (node) => { + if (!emptyDeclarationFilter(node as Mdast)) { + return false; + } + return true; + }); + return extracted as Mdast; +}; diff --git a/packages/webforai/src/html-to-mdast.ts b/packages/webforai/src/html-to-mdast.ts index 109523e..c66094b 100644 --- a/packages/webforai/src/html-to-mdast.ts +++ b/packages/webforai/src/html-to-mdast.ts @@ -1,10 +1,10 @@ import type { Nodes as Hast } from "hast"; -import type { Nodes as Mdast } from "mdast"; - import { fromHtml } from "hast-util-from-html"; import { toMdast } from "hast-util-to-mdast"; +import type { Nodes as Mdast } from "mdast"; import { type Extracotrs, extractHast } from "./extract-hast"; +import { extractMdast } from "./extract-mdast"; import { customAHandler } from "./mdast-handlers/custom-a-handler"; import { customCodeHandler } from "./mdast-handlers/custom-code-handler"; import { customDivHandler } from "./mdast-handlers/custom-div-handler"; @@ -12,6 +12,7 @@ import { customImgHandler } from "./mdast-handlers/custom-img-handler"; import { customTableHandler } from "./mdast-handlers/custom-table-handler"; import { emptyHandler } from "./mdast-handlers/empty-handler"; import { mathHandler } from "./mdast-handlers/math-handler"; +import { warpRoot } from "./utils/mdast-utils"; export type HtmlToMdastOptions = { /** @@ -63,5 +64,7 @@ export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOpti }, }); - return mdast; + const extractedMdast = extractMdast(mdast); + + return extractedMdast; }; diff --git a/packages/webforai/src/mdast-handlers/custom-a-handler.ts b/packages/webforai/src/mdast-handlers/custom-a-handler.ts index 56f6740..91e3b5a 100644 --- a/packages/webforai/src/mdast-handlers/custom-a-handler.ts +++ b/packages/webforai/src/mdast-handlers/custom-a-handler.ts @@ -5,7 +5,12 @@ export const customAHandler = (options?: { asText?: boolean }): Handle => (state, node) => { if (options?.asText) { - const link = { type: "text", value: hastToString(node) } as const; + const text = hastToString(node); + if (3 >= text.length) { + return undefined; + } + + const link = { type: "text", value: text } as const; state.patch(node, link); return link; } From 7095312baa68c61dfc2e7f361b84ea8a7e8523ef Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 22:56:18 +0900 Subject: [PATCH 5/7] style: --- packages/webforai/src/html-to-mdast.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/webforai/src/html-to-mdast.ts b/packages/webforai/src/html-to-mdast.ts index c66094b..e383fdb 100644 --- a/packages/webforai/src/html-to-mdast.ts +++ b/packages/webforai/src/html-to-mdast.ts @@ -12,7 +12,6 @@ import { customImgHandler } from "./mdast-handlers/custom-img-handler"; import { customTableHandler } from "./mdast-handlers/custom-table-handler"; import { emptyHandler } from "./mdast-handlers/empty-handler"; import { mathHandler } from "./mdast-handlers/math-handler"; -import { warpRoot } from "./utils/mdast-utils"; export type HtmlToMdastOptions = { /** From ab3ee5b2be97ac9cef63423c5c5b44343b08c5a3 Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 23:05:54 +0900 Subject: [PATCH 6/7] feat: --- packages/webforai/src/extract-hast/readability.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/webforai/src/extract-hast/readability.ts b/packages/webforai/src/extract-hast/readability.ts index 24fb939..95d9466 100644 --- a/packages/webforai/src/extract-hast/readability.ts +++ b/packages/webforai/src/extract-hast/readability.ts @@ -19,7 +19,7 @@ const REGEXPS = { hidden: /hidden|invisible|fallback-image/i, byline: /byline|author|dateline|writtenby|p-author/i, unlikelyCandidates: - /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore/i, + /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|uls-language-block/i, okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i, }; From 3c6da3952f176769cf8aa899f6c7207c231d806a Mon Sep 17 00:00:00 2001 From: inaridiy Date: Wed, 5 Jun 2024 23:06:36 +0900 Subject: [PATCH 7/7] Improve --- .changeset/little-papayas-divide.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/little-papayas-divide.md diff --git a/.changeset/little-papayas-divide.md b/.changeset/little-papayas-divide.md new file mode 100644 index 0000000..c7ead56 --- /dev/null +++ b/.changeset/little-papayas-divide.md @@ -0,0 +1,5 @@ +--- +"webforai": patch +--- + +Improve