diff --git a/.changeset/little-papayas-divide.md b/.changeset/little-papayas-divide.md new file mode 100644 index 0000000..c7ead56 --- /dev/null +++ b/.changeset/little-papayas-divide.md @@ -0,0 +1,5 @@ +--- +"webforai": patch +--- + +Improve diff --git a/.changeset/nervous-eagles-hope.md b/.changeset/nervous-eagles-hope.md new file mode 100644 index 0000000..b759fc3 --- /dev/null +++ b/.changeset/nervous-eagles-hope.md @@ -0,0 +1,5 @@ +--- +"webforai": patch +--- + +accuracy improvement diff --git a/.changeset/proud-pillows-sort.md b/.changeset/proud-pillows-sort.md new file mode 100644 index 0000000..3a701c5 --- /dev/null +++ b/.changeset/proud-pillows-sort.md @@ -0,0 +1,5 @@ +--- +"webforai": patch +--- + +Minor performance improvements diff --git a/packages/webforai/src/extract-hast/readability.ts b/packages/webforai/src/extract-hast/readability.ts index 086dc90..95d9466 100644 --- a/packages/webforai/src/extract-hast/readability.ts +++ b/packages/webforai/src/extract-hast/readability.ts @@ -19,7 +19,7 @@ const REGEXPS = { hidden: /hidden|invisible|fallback-image/i, byline: /byline|author|dateline|writtenby|p-author/i, unlikelyCandidates: - /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore/i, + /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|uls-language-block/i, okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i, }; @@ -97,7 +97,6 @@ const unlikelyElementFilter = (node: Hast) => { return true; } const element = node as Element; - const match = matchString(element); // Skip main content elements if (["body", "article", "main", "section", "a"].includes(element.tagName)) { @@ -106,6 +105,7 @@ const unlikelyElementFilter = (node: Hast) => { if (hasAncestors(element, ["table", "code"], 3)) { return true; } + const match = matchString(element); // Remove unlikely candidates if (REGEXPS.unlikelyCandidates.test(match) && !REGEXPS.okMaybeItsaCandidate.test(match)) { @@ -115,19 +115,39 @@ const unlikelyElementFilter = (node: Hast) => { return true; }; +const isImageLink = (element: Element) => { + const a = select("a", element); + const img = select("img", a); + + if (!(a && img)) { + return false; + } + + const imgFilename = img.properties.src?.toString().split("/").pop(); + const hrefFilename = a.properties.href?.toString().split("/").pop(); + + return imgFilename === hrefFilename && imgFilename; +}; + const removeEmptyFilter = (node: Hast) => { if (node.type !== "element") { return true; } const element = node as Element; + if (!PARAGRAPH_TAGS.includes(element.tagName)) { return true; } + if (isImageLink(element)) { + return true; + } + const text = hastToString(element); if (text.length < 10) { return false; } + return true; }; @@ -136,7 +156,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => { const body = select("body", hast) ?? hast; const proxiedHast = parents(body) as unknown as ProxiedHast; - let baseFilterd = filter(proxiedHast, (node) => { + const baseFilterd = filter(proxiedHast, (node) => { if (!metadataFilter(node as Hast)) { return false; } @@ -152,17 +172,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => { const baseText = hastToString(baseFilterd); let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH[lang as keyof typeof BASE_MINIMAL_LENGTH] : 500; - if (baseText.length > minimalLength) { - const filterd = filter(baseFilterd, (node) => { - if (!unlikelyElementFilter(node as Hast)) { - return false; - } - return true; - }); - if (filterd) { - baseFilterd = filterd; - } - } else { + if (baseText.length < minimalLength) { minimalLength = Math.max(0, baseText.length - 200); } @@ -193,6 +203,10 @@ export const readabilityExtractHast = (hast: Hast): Hast => { if (!removeEmptyFilter(node as Hast)) { return false; } + if (!unlikelyElementFilter(node as Hast)) { + return false; + } + return true; }) as Hast; diff --git a/packages/webforai/src/extract-hast/utils.ts b/packages/webforai/src/extract-hast/utils.ts index 2cef842..f140433 100644 --- a/packages/webforai/src/extract-hast/utils.ts +++ b/packages/webforai/src/extract-hast/utils.ts @@ -1,7 +1,7 @@ import type { Element } from "hast"; export const matchString = (element: Element) => - `${element.tagName} ${element.properties.id} ${classnames(element).join(" ")} }`; + `${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`; export const classnames = (element: Element) => { if (Array.isArray(element.properties.className)) { diff --git a/packages/webforai/src/extract-mdast.ts b/packages/webforai/src/extract-mdast.ts new file mode 100644 index 0000000..5f3177e --- /dev/null +++ b/packages/webforai/src/extract-mdast.ts @@ -0,0 +1,25 @@ +import type { Nodes as Mdast, Parent } from "mdast"; +import { filter } from "unist-util-filter"; + +const DECLATION_TYPES = ["blockquote", "strong", "emphasis", "delete"]; + +const emptyDeclarationFilter = (node: Mdast) => { + if (!DECLATION_TYPES.includes(node.type)) { + return true; + } + if ((node as Parent).children.length === 0) { + return false; + } + + return true; +}; + +export const extractMdast = (node: Mdast) => { + const extracted = filter(node, (node) => { + if (!emptyDeclarationFilter(node as Mdast)) { + return false; + } + return true; + }); + return extracted as Mdast; +}; diff --git a/packages/webforai/src/html-to-markdown.test.ts b/packages/webforai/src/html-to-markdown.test.ts index c54e94e..8754482 100644 --- a/packages/webforai/src/html-to-markdown.test.ts +++ b/packages/webforai/src/html-to-markdown.test.ts @@ -45,8 +45,8 @@ Example ![Example](/example.jpg) -* Item 1 -* Item 2 +- Item 1 +- Item 2 `; const imageHidden = `# Hello, world! @@ -55,8 +55,8 @@ This is a paragraph. [Example](/example.html) -* Item 1 -* Item 2 +- Item 1 +- Item 2 `; const htmlTable = ` @@ -85,31 +85,31 @@ describe("htmlToMarkdown", () => { it("should convert HTML to Markdown", () => { const markdown = htmlToMarkdown(html, { extractors: false }); const d = distance(markdown, expected); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML to Markdown with replaced base URL", () => { const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false }); const d = distance(markdown, baseUrlReplaced); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML to Markdown with links as text", () => { const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false }); const d = distance(markdown, linkAsText); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML to Markdown with hidden images", () => { const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false }); const d = distance(markdown, imageHidden); - expect(d).lte(2); + expect(d).lte(5); }); it("should convert HTML table to Markdown table", () => { const markdown = htmlToMarkdown(htmlTable, { extractors: false }); const d = distance(markdown, expectedTableMarkdown); - expect(d).lte(4); + expect(d).lte(5); }); it("should convert HTML table with table as text option", () => { @@ -127,6 +127,16 @@ describe("htmlToMarkdown E2E", () => { // @ts-ignore const original = await import("../README.md?raw"); const d = distance(markdown, original.default); - expect(d).lte(1000); // I'd like to optimise more! + expect(d).lte(400); // I'd like to optimise more! + }); + + it("should convert GitHub README to Markdown ", async () => { + const html = await fetch("https://github.com/inaridiy/webforai").then((res) => res.text()); + const markdown = htmlToMarkdown(html, { baseUrl: "https://www.npmjs.com/package/webforai" }); + + // @ts-ignore + const original = await import("../../../README.md?raw"); + const d = distance(markdown, original.default); + expect(d).lte(200); // I'd like to optimise more! }); }); diff --git a/packages/webforai/src/html-to-mdast.ts b/packages/webforai/src/html-to-mdast.ts index 109523e..e383fdb 100644 --- a/packages/webforai/src/html-to-mdast.ts +++ b/packages/webforai/src/html-to-mdast.ts @@ -1,10 +1,10 @@ import type { Nodes as Hast } from "hast"; -import type { Nodes as Mdast } from "mdast"; - import { fromHtml } from "hast-util-from-html"; import { toMdast } from "hast-util-to-mdast"; +import type { Nodes as Mdast } from "mdast"; import { type Extracotrs, extractHast } from "./extract-hast"; +import { extractMdast } from "./extract-mdast"; import { customAHandler } from "./mdast-handlers/custom-a-handler"; import { customCodeHandler } from "./mdast-handlers/custom-code-handler"; import { customDivHandler } from "./mdast-handlers/custom-div-handler"; @@ -63,5 +63,7 @@ export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOpti }, }); - return mdast; + const extractedMdast = extractMdast(mdast); + + return extractedMdast; }; diff --git a/packages/webforai/src/mdast-handlers/custom-a-handler.ts b/packages/webforai/src/mdast-handlers/custom-a-handler.ts index 56f6740..91e3b5a 100644 --- a/packages/webforai/src/mdast-handlers/custom-a-handler.ts +++ b/packages/webforai/src/mdast-handlers/custom-a-handler.ts @@ -5,7 +5,12 @@ export const customAHandler = (options?: { asText?: boolean }): Handle => (state, node) => { if (options?.asText) { - const link = { type: "text", value: hastToString(node) } as const; + const text = hastToString(node); + if (3 >= text.length) { + return undefined; + } + + const link = { type: "text", value: text } as const; state.patch(node, link); return link; } diff --git a/packages/webforai/src/mdast-to-markdown.ts b/packages/webforai/src/mdast-to-markdown.ts index 831551f..32fd692 100644 --- a/packages/webforai/src/mdast-to-markdown.ts +++ b/packages/webforai/src/mdast-to-markdown.ts @@ -22,6 +22,7 @@ export interface MdastToMarkdownOptions extends ToMarkdownOptions { */ export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = { extensions: [gfmToMarkdown(), mathToMarkdown()], + bullet: "-", }; /**