Skip to content

Commit

Permalink
Merge pull request #18 from inaridiy/feat/accuracy-improvement
Browse files Browse the repository at this point in the history
Feat/accuracy improvement
  • Loading branch information
inaridiy authored Jun 5, 2024
2 parents 6bf88c3 + 3c6da39 commit 871564f
Show file tree
Hide file tree
Showing 10 changed files with 101 additions and 29 deletions.
5 changes: 5 additions & 0 deletions .changeset/little-papayas-divide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

Improve readability extraction accuracy
5 changes: 5 additions & 0 deletions .changeset/nervous-eagles-hope.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

accuracy improvement
5 changes: 5 additions & 0 deletions .changeset/proud-pillows-sort.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

Minor performance improvements
42 changes: 28 additions & 14 deletions packages/webforai/src/extract-hast/readability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const REGEXPS = {
hidden: /hidden|invisible|fallback-image/i,
byline: /byline|author|dateline|writtenby|p-author/i,
unlikelyCandidates:
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore/i,
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|uls-language-block/i,
okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i,
};

Expand Down Expand Up @@ -97,7 +97,6 @@ const unlikelyElementFilter = (node: Hast) => {
return true;
}
const element = node as Element;
const match = matchString(element);

// Skip main content elements
if (["body", "article", "main", "section", "a"].includes(element.tagName)) {
Expand All @@ -106,6 +105,7 @@ const unlikelyElementFilter = (node: Hast) => {
if (hasAncestors(element, ["table", "code"], 3)) {
return true;
}
const match = matchString(element);

// Remove unlikely candidates
if (REGEXPS.unlikelyCandidates.test(match) && !REGEXPS.okMaybeItsaCandidate.test(match)) {
Expand All @@ -115,19 +115,39 @@ const unlikelyElementFilter = (node: Hast) => {
return true;
};

/**
 * Detects a "thumbnail link" pattern: an `<a>` wrapping an `<img>` where the
 * link target and the image source end in the same file name. Such nodes are
 * meaningful content even though they contain little or no text.
 *
 * @param element - hast element to inspect
 * @returns true when the element contains a matching image link
 */
const isImageLink = (element: Element): boolean => {
	const anchor = select("a", element);
	if (!anchor) {
		return false;
	}
	const img = select("img", anchor);
	if (!img) {
		return false;
	}

	// Compare only the final path segment so relative vs. absolute URLs still match.
	const imgFilename = img.properties.src?.toString().split("/").pop();
	const hrefFilename = anchor.properties.href?.toString().split("/").pop();

	// Boolean() keeps the original falsy-"" behavior (src ending in "/") while
	// returning an actual boolean instead of leaking the filename string.
	return Boolean(imgFilename) && imgFilename === hrefFilename;
};

/**
 * Filter predicate for the hast tree: returns false for paragraph-like
 * elements whose text content is too short to be meaningful, true otherwise.
 *
 * @param node - hast node under inspection
 * @returns false when the node should be pruned from the tree
 */
const removeEmptyFilter = (node: Hast) => {
	// Non-element nodes (text, comments, ...) are never pruned here.
	if (node.type !== "element") return true;
	const element = node as Element;

	// Only paragraph-like containers are candidates for removal.
	if (!PARAGRAPH_TAGS.includes(element.tagName)) return true;

	// Image links carry content even when their text is short or empty.
	if (isImageLink(element)) return true;

	// Keep the paragraph only if its text content is at least 10 characters.
	return hastToString(element).length >= 10;
};

Expand All @@ -136,7 +156,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
const body = select("body", hast) ?? hast;

const proxiedHast = parents(body) as unknown as ProxiedHast;
let baseFilterd = filter(proxiedHast, (node) => {
const baseFilterd = filter(proxiedHast, (node) => {
if (!metadataFilter(node as Hast)) {
return false;
}
Expand All @@ -152,17 +172,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => {

const baseText = hastToString(baseFilterd);
let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH[lang as keyof typeof BASE_MINIMAL_LENGTH] : 500;
if (baseText.length > minimalLength) {
const filterd = filter(baseFilterd, (node) => {
if (!unlikelyElementFilter(node as Hast)) {
return false;
}
return true;
});
if (filterd) {
baseFilterd = filterd;
}
} else {
if (baseText.length < minimalLength) {
minimalLength = Math.max(0, baseText.length - 200);
}

Expand Down Expand Up @@ -193,6 +203,10 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
if (!removeEmptyFilter(node as Hast)) {
return false;
}
if (!unlikelyElementFilter(node as Hast)) {
return false;
}

return true;
}) as Hast;

Expand Down
2 changes: 1 addition & 1 deletion packages/webforai/src/extract-hast/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { Element } from "hast";

export const matchString = (element: Element) =>
`${element.tagName} ${element.properties.id} ${classnames(element).join(" ")} }`;
`${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`;

export const classnames = (element: Element) => {
if (Array.isArray(element.properties.className)) {
Expand Down
25 changes: 25 additions & 0 deletions packages/webforai/src/extract-mdast.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import type { Nodes as Mdast, Parent } from "mdast";
import { filter } from "unist-util-filter";

// Mdast node types that only decorate their children; when empty they render nothing.
// (Renamed from the misspelled DECLATION_TYPES — module-private, so safe to fix.)
const DECLARATION_TYPES = ["blockquote", "strong", "emphasis", "delete"];

/**
 * Filter predicate: returns false for decoration nodes that have no children.
 *
 * @param node - mdast node under inspection
 * @returns false when the node is an empty decoration and should be pruned
 */
const emptyDeclarationFilter = (node: Mdast) => {
	if (!DECLARATION_TYPES.includes(node.type)) {
		return true;
	}
	// Decoration nodes are parents; an empty children array means no content.
	return (node as Parent).children.length > 0;
};

/**
 * Removes empty decoration nodes (blockquote/strong/emphasis/delete) from an
 * mdast tree so they do not emit stray markers in the rendered markdown.
 *
 * @param node - mdast tree to clean
 * @returns the filtered tree
 */
export const extractMdast = (node: Mdast) => {
	// Avoid shadowing the outer `node` parameter inside the callback.
	const extracted = filter(node, (child) => emptyDeclarationFilter(child as Mdast));
	return extracted as Mdast;
};
30 changes: 20 additions & 10 deletions packages/webforai/src/html-to-markdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ Example
![Example](/example.jpg)
* Item 1
* Item 2
- Item 1
- Item 2
`;

const imageHidden = `# Hello, world!
Expand All @@ -55,8 +55,8 @@ This is a paragraph.
[Example](/example.html)
* Item 1
* Item 2
- Item 1
- Item 2
`;

const htmlTable = `
Expand Down Expand Up @@ -85,31 +85,31 @@ describe("htmlToMarkdown", () => {
it("should convert HTML to Markdown", () => {
const markdown = htmlToMarkdown(html, { extractors: false });
const d = distance(markdown, expected);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with replaced base URL", () => {
const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false });
const d = distance(markdown, baseUrlReplaced);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with links as text", () => {
const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false });
const d = distance(markdown, linkAsText);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with hidden images", () => {
const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false });
const d = distance(markdown, imageHidden);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML table to Markdown table", () => {
const markdown = htmlToMarkdown(htmlTable, { extractors: false });
const d = distance(markdown, expectedTableMarkdown);
expect(d).lte(4);
expect(d).lte(5);
});

it("should convert HTML table with table as text option", () => {
Expand All @@ -127,6 +127,16 @@ describe("htmlToMarkdown E2E", () => {
// @ts-ignore
const original = await import("../README.md?raw");
const d = distance(markdown, original.default);
expect(d).lte(1000); // I'd like to optimise more!
expect(d).lte(400); // I'd like to optimise more!
});

it("should convert GitHub README to Markdown ", async () => {
const html = await fetch("https://github.com/inaridiy/webforai").then((res) => res.text());
const markdown = htmlToMarkdown(html, { baseUrl: "https://www.npmjs.com/package/webforai" });

// @ts-ignore
const original = await import("../../../README.md?raw");
const d = distance(markdown, original.default);
expect(d).lte(200); // I'd like to optimise more!
});
});
8 changes: 5 additions & 3 deletions packages/webforai/src/html-to-mdast.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import type { Nodes as Hast } from "hast";
import type { Nodes as Mdast } from "mdast";

import { fromHtml } from "hast-util-from-html";
import { toMdast } from "hast-util-to-mdast";
import type { Nodes as Mdast } from "mdast";

import { type Extracotrs, extractHast } from "./extract-hast";
import { extractMdast } from "./extract-mdast";
import { customAHandler } from "./mdast-handlers/custom-a-handler";
import { customCodeHandler } from "./mdast-handlers/custom-code-handler";
import { customDivHandler } from "./mdast-handlers/custom-div-handler";
Expand Down Expand Up @@ -63,5 +63,7 @@ export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOpti
},
});

return mdast;
const extractedMdast = extractMdast(mdast);

return extractedMdast;
};
7 changes: 6 additions & 1 deletion packages/webforai/src/mdast-handlers/custom-a-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ export const customAHandler =
(options?: { asText?: boolean }): Handle =>
(state, node) => {
if (options?.asText) {
const link = { type: "text", value: hastToString(node) } as const;
const text = hastToString(node);
if (3 >= text.length) {
return undefined;
}

const link = { type: "text", value: text } as const;
state.patch(node, link);
return link;
}
Expand Down
1 change: 1 addition & 0 deletions packages/webforai/src/mdast-to-markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export interface MdastToMarkdownOptions extends ToMarkdownOptions {
*/
export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = {
extensions: [gfmToMarkdown(), mathToMarkdown()],
bullet: "-",
};

/**
Expand Down

0 comments on commit 871564f

Please sign in to comment.