Skip to content

Commit

Permalink
Merge pull request #18 from inaridiy/feat/accuracy-improvement
Browse files Browse the repository at this point in the history
Feat/accuracy improvement
  • Loading branch information
inaridiy authored Jun 5, 2024
2 parents 6bf88c3 + 3c6da39 commit 871564f
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 29 deletions.
5 changes: 5 additions & 0 deletions .changeset/little-papayas-divide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

Improve readability extraction accuracy
5 changes: 5 additions & 0 deletions .changeset/nervous-eagles-hope.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

accuracy improvement
5 changes: 5 additions & 0 deletions .changeset/proud-pillows-sort.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

Minor performance improvements
42 changes: 28 additions & 14 deletions packages/webforai/src/extract-hast/readability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const REGEXPS = {
hidden: /hidden|invisible|fallback-image/i,
byline: /byline|author|dateline|writtenby|p-author/i,
unlikelyCandidates:
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore/i,
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|uls-language-block/i,
okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i,
};

Expand Down Expand Up @@ -97,7 +97,6 @@ const unlikelyElementFilter = (node: Hast) => {
return true;
}
const element = node as Element;
const match = matchString(element);

// Skip main content elements
if (["body", "article", "main", "section", "a"].includes(element.tagName)) {
Expand All @@ -106,6 +105,7 @@ const unlikelyElementFilter = (node: Hast) => {
if (hasAncestors(element, ["table", "code"], 3)) {
return true;
}
const match = matchString(element);

// Remove unlikely candidates
if (REGEXPS.unlikelyCandidates.test(match) && !REGEXPS.okMaybeItsaCandidate.test(match)) {
Expand All @@ -115,19 +115,39 @@ const unlikelyElementFilter = (node: Hast) => {
return true;
};

/**
 * Detects a "thumbnail link" pattern: an `<a>` wrapping an `<img>` where the
 * link target and the image source end in the same file name. Such nodes are
 * meaningful content even though they contain little or no text.
 *
 * @param element - hast element to inspect
 * @returns true when the element contains a matching image link
 */
const isImageLink = (element: Element): boolean => {
	const anchor = select("a", element);
	if (!anchor) {
		return false;
	}
	const img = select("img", anchor);
	if (!img) {
		return false;
	}

	// Compare only the final path segment so relative vs. absolute URLs still match.
	const imgFilename = img.properties.src?.toString().split("/").pop();
	const hrefFilename = anchor.properties.href?.toString().split("/").pop();

	// Boolean() keeps the original falsy-"" behavior (src ending in "/") while
	// returning an actual boolean instead of leaking the filename string.
	return Boolean(imgFilename) && imgFilename === hrefFilename;
};

/**
 * Filter predicate for the hast tree: returns false for paragraph-like
 * elements whose text content is too short to be meaningful, true otherwise.
 *
 * @param node - hast node under inspection
 * @returns false when the node should be pruned from the tree
 */
const removeEmptyFilter = (node: Hast) => {
	// Non-element nodes (text, comments, ...) are never pruned here.
	if (node.type !== "element") return true;
	const element = node as Element;

	// Only paragraph-like containers are candidates for removal.
	if (!PARAGRAPH_TAGS.includes(element.tagName)) return true;

	// Image links carry content even when their text is short or empty.
	if (isImageLink(element)) return true;

	// Keep the paragraph only if its text content is at least 10 characters.
	return hastToString(element).length >= 10;
};

Expand All @@ -136,7 +156,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
const body = select("body", hast) ?? hast;

const proxiedHast = parents(body) as unknown as ProxiedHast;
let baseFilterd = filter(proxiedHast, (node) => {
const baseFilterd = filter(proxiedHast, (node) => {
if (!metadataFilter(node as Hast)) {
return false;
}
Expand All @@ -152,17 +172,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => {

const baseText = hastToString(baseFilterd);
let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH[lang as keyof typeof BASE_MINIMAL_LENGTH] : 500;
if (baseText.length > minimalLength) {
const filterd = filter(baseFilterd, (node) => {
if (!unlikelyElementFilter(node as Hast)) {
return false;
}
return true;
});
if (filterd) {
baseFilterd = filterd;
}
} else {
if (baseText.length < minimalLength) {
minimalLength = Math.max(0, baseText.length - 200);
}

Expand Down Expand Up @@ -193,6 +203,10 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
if (!removeEmptyFilter(node as Hast)) {
return false;
}
if (!unlikelyElementFilter(node as Hast)) {
return false;
}

return true;
}) as Hast;

Expand Down
2 changes: 1 addition & 1 deletion packages/webforai/src/extract-hast/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { Element } from "hast";

export const matchString = (element: Element) =>
`${element.tagName} ${element.properties.id} ${classnames(element).join(" ")} }`;
`${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`;

export const classnames = (element: Element) => {
if (Array.isArray(element.properties.className)) {
Expand Down
25 changes: 25 additions & 0 deletions packages/webforai/src/extract-mdast.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import type { Nodes as Mdast, Parent } from "mdast";
import { filter } from "unist-util-filter";

// Mdast node types that only decorate their children; when empty they render nothing.
// (Renamed from the misspelled DECLATION_TYPES — module-private, so safe to fix.)
const DECLARATION_TYPES = ["blockquote", "strong", "emphasis", "delete"];

/**
 * Filter predicate: returns false for decoration nodes that have no children.
 *
 * @param node - mdast node under inspection
 * @returns false when the node is an empty decoration and should be pruned
 */
const emptyDeclarationFilter = (node: Mdast) => {
	if (!DECLARATION_TYPES.includes(node.type)) {
		return true;
	}
	// Decoration nodes are parents; an empty children array means no content.
	return (node as Parent).children.length > 0;
};

/**
 * Removes empty decoration nodes (blockquote/strong/emphasis/delete) from an
 * mdast tree so they do not emit stray markers in the rendered markdown.
 *
 * @param node - mdast tree to clean
 * @returns the filtered tree
 */
export const extractMdast = (node: Mdast) => {
	// Avoid shadowing the outer `node` parameter inside the callback.
	const extracted = filter(node, (child) => emptyDeclarationFilter(child as Mdast));
	return extracted as Mdast;
};
30 changes: 20 additions & 10 deletions packages/webforai/src/html-to-markdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ Example
![Example](/example.jpg)
* Item 1
* Item 2
- Item 1
- Item 2
`;

const imageHidden = `# Hello, world!
Expand All @@ -55,8 +55,8 @@ This is a paragraph.
[Example](/example.html)
* Item 1
* Item 2
- Item 1
- Item 2
`;

const htmlTable = `
Expand Down Expand Up @@ -85,31 +85,31 @@ describe("htmlToMarkdown", () => {
it("should convert HTML to Markdown", () => {
const markdown = htmlToMarkdown(html, { extractors: false });
const d = distance(markdown, expected);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with replaced base URL", () => {
const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false });
const d = distance(markdown, baseUrlReplaced);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with links as text", () => {
const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false });
const d = distance(markdown, linkAsText);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with hidden images", () => {
const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false });
const d = distance(markdown, imageHidden);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML table to Markdown table", () => {
const markdown = htmlToMarkdown(htmlTable, { extractors: false });
const d = distance(markdown, expectedTableMarkdown);
expect(d).lte(4);
expect(d).lte(5);
});

it("should convert HTML table with table as text option", () => {
Expand All @@ -127,6 +127,16 @@ describe("htmlToMarkdown E2E", () => {
// @ts-ignore
const original = await import("../README.md?raw");
const d = distance(markdown, original.default);
expect(d).lte(1000); // I'd like to optimise more!
expect(d).lte(400); // I'd like to optimise more!
});

it("should convert GitHub README to Markdown ", async () => {
const html = await fetch("https://github.com/inaridiy/webforai").then((res) => res.text());
const markdown = htmlToMarkdown(html, { baseUrl: "https://www.npmjs.com/package/webforai" });

// @ts-ignore
const original = await import("../../../README.md?raw");
const d = distance(markdown, original.default);
expect(d).lte(200); // I'd like to optimise more!
});
});
8 changes: 5 additions & 3 deletions packages/webforai/src/html-to-mdast.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import type { Nodes as Hast } from "hast";
import type { Nodes as Mdast } from "mdast";

import { fromHtml } from "hast-util-from-html";
import { toMdast } from "hast-util-to-mdast";
import type { Nodes as Mdast } from "mdast";

import { type Extracotrs, extractHast } from "./extract-hast";
import { extractMdast } from "./extract-mdast";
import { customAHandler } from "./mdast-handlers/custom-a-handler";
import { customCodeHandler } from "./mdast-handlers/custom-code-handler";
import { customDivHandler } from "./mdast-handlers/custom-div-handler";
Expand Down Expand Up @@ -63,5 +63,7 @@ export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOpti
},
});

return mdast;
const extractedMdast = extractMdast(mdast);

return extractedMdast;
};
7 changes: 6 additions & 1 deletion packages/webforai/src/mdast-handlers/custom-a-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ export const customAHandler =
(options?: { asText?: boolean }): Handle =>
(state, node) => {
if (options?.asText) {
const link = { type: "text", value: hastToString(node) } as const;
const text = hastToString(node);
if (3 >= text.length) {
return undefined;
}

const link = { type: "text", value: text } as const;
state.patch(node, link);
return link;
}
Expand Down
1 change: 1 addition & 0 deletions packages/webforai/src/mdast-to-markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export interface MdastToMarkdownOptions extends ToMarkdownOptions {
*/
export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = {
extensions: [gfmToMarkdown(), mathToMarkdown()],
bullet: "-",
};

/**
Expand Down

0 comments on commit 871564f

Please sign in to comment.