Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/accuracy improvement #18

Merged
merged 7 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/little-papayas-divide.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

Improve extraction accuracy
5 changes: 5 additions & 0 deletions .changeset/nervous-eagles-hope.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

accuracy improvement
5 changes: 5 additions & 0 deletions .changeset/proud-pillows-sort.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"webforai": patch
---

Minor performance improvements
42 changes: 28 additions & 14 deletions packages/webforai/src/extract-hast/readability.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const REGEXPS = {
hidden: /hidden|invisible|fallback-image/i,
byline: /byline|author|dateline|writtenby|p-author/i,
unlikelyCandidates:
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore/i,
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote|speechify-ignore|uls-language-block/i,
okMaybeItsaCandidate: /and|article|body|column|content|main|shadow|code/i,
};

Expand Down Expand Up @@ -97,7 +97,6 @@ const unlikelyElementFilter = (node: Hast) => {
return true;
}
const element = node as Element;
const match = matchString(element);

// Skip main content elements
if (["body", "article", "main", "section", "a"].includes(element.tagName)) {
Expand All @@ -106,6 +105,7 @@ const unlikelyElementFilter = (node: Hast) => {
if (hasAncestors(element, ["table", "code"], 3)) {
return true;
}
const match = matchString(element);

// Remove unlikely candidates
if (REGEXPS.unlikelyCandidates.test(match) && !REGEXPS.okMaybeItsaCandidate.test(match)) {
Expand All @@ -115,19 +115,39 @@ const unlikelyElementFilter = (node: Hast) => {
return true;
};

/**
 * Reports whether `element` contains a link that wraps an image pointing at
 * the same file, i.e. the first `a` found in `element` contains an `img`
 * whose `src` ends in the same last path segment as the link's `href`.
 *
 * @param element - Element to inspect.
 * @returns `true` only when both file names exist, are non-empty and are equal.
 */
const isImageLink = (element: Element): boolean => {
  const anchor = select("a", element);
  if (!anchor) {
    return false;
  }

  const image = select("img", anchor);
  if (!image) {
    return false;
  }

  // Compare only the last path segment of each URL.
  const imageFilename = image.properties.src?.toString().split("/").pop();
  const linkFilename = anchor.properties.href?.toString().split("/").pop();

  // Always return a real boolean: a missing or empty file name never counts as a match.
  return Boolean(imageFilename) && imageFilename === linkFilename;
};

/**
 * Filter predicate that drops paragraph-type elements with almost no text.
 *
 * Kept unconditionally: non-element nodes, elements whose tag is not listed in
 * `PARAGRAPH_TAGS`, and elements recognised by `isImageLink`. Every other
 * paragraph-type element is kept only when its text content is at least
 * 10 characters long.
 *
 * @param node - Node being visited by the filter.
 * @returns `true` to keep the node, `false` to remove it.
 */
const removeEmptyFilter = (node: Hast) => {
  if (node.type !== "element") {
    return true;
  }

  const candidate = node as Element;
  const isParagraphLike = PARAGRAPH_TAGS.includes(candidate.tagName);

  // `isImageLink` is only consulted for paragraph-type elements.
  if (!isParagraphLike || isImageLink(candidate)) {
    return true;
  }

  return hastToString(candidate).length >= 10;
};

Expand All @@ -136,7 +156,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
const body = select("body", hast) ?? hast;

const proxiedHast = parents(body) as unknown as ProxiedHast;
let baseFilterd = filter(proxiedHast, (node) => {
const baseFilterd = filter(proxiedHast, (node) => {
if (!metadataFilter(node as Hast)) {
return false;
}
Expand All @@ -152,17 +172,7 @@ export const readabilityExtractHast = (hast: Hast): Hast => {

const baseText = hastToString(baseFilterd);
let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH[lang as keyof typeof BASE_MINIMAL_LENGTH] : 500;
if (baseText.length > minimalLength) {
const filterd = filter(baseFilterd, (node) => {
if (!unlikelyElementFilter(node as Hast)) {
return false;
}
return true;
});
if (filterd) {
baseFilterd = filterd;
}
} else {
if (baseText.length < minimalLength) {
minimalLength = Math.max(0, baseText.length - 200);
}

Expand Down Expand Up @@ -193,6 +203,10 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
if (!removeEmptyFilter(node as Hast)) {
return false;
}
if (!unlikelyElementFilter(node as Hast)) {
return false;
}

return true;
}) as Hast;

Expand Down
2 changes: 1 addition & 1 deletion packages/webforai/src/extract-hast/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { Element } from "hast";

export const matchString = (element: Element) =>
`${element.tagName} ${element.properties.id} ${classnames(element).join(" ")} }`;
`${element.tagName} ${element.properties.id} ${classnames(element).join(" ")}`;

export const classnames = (element: Element) => {
if (Array.isArray(element.properties.className)) {
Expand Down
25 changes: 25 additions & 0 deletions packages/webforai/src/extract-mdast.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import type { Nodes as Mdast, Parent } from "mdast";
import { filter } from "unist-util-filter";

/** mdast node types that are removed when they have no children. */
const EMPTY_REMOVABLE_TYPES: ReadonlySet<string> = new Set(["blockquote", "strong", "emphasis", "delete"]);

/**
 * Filter predicate that rejects empty blockquote / strong / emphasis / delete nodes.
 *
 * @param node - mdast node being visited by the filter.
 * @returns `false` when `node` is one of the listed types and has an empty
 *   `children` array; `true` for every other node.
 */
const emptyDeclarationFilter = (node: Mdast) => {
  if (!EMPTY_REMOVABLE_TYPES.has(node.type)) {
    return true;
  }

  // Narrow with `in` instead of casting, so a node without `children` is kept
  // rather than causing a TypeError.
  const isEmptyParent = "children" in node && node.children.length === 0;
  return !isEmptyParent;
};

/**
 * Returns a copy of the mdast tree with every node rejected by
 * `emptyDeclarationFilter` removed.
 *
 * @param node - Root of the mdast tree to clean up.
 * @returns The filtered tree, or an empty `root` node when nothing is left.
 */
export const extractMdast = (node: Mdast): Mdast => {
  const extracted = filter(node, (candidate) => emptyDeclarationFilter(candidate as Mdast)) as Mdast | undefined;

  // Per the unist-util-filter documentation, `filter` yields `undefined` when
  // the tree itself fails the test or is cascaded away because all of its
  // children were removed. Return an empty root so callers always get a node.
  const emptyRoot: Mdast = { type: "root", children: [] };
  return extracted ?? emptyRoot;
};
30 changes: 20 additions & 10 deletions packages/webforai/src/html-to-markdown.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ Example

![Example](/example.jpg)

* Item 1
* Item 2
- Item 1
- Item 2
`;

const imageHidden = `# Hello, world!
Expand All @@ -55,8 +55,8 @@ This is a paragraph.

[Example](/example.html)

* Item 1
* Item 2
- Item 1
- Item 2
`;

const htmlTable = `
Expand Down Expand Up @@ -85,31 +85,31 @@ describe("htmlToMarkdown", () => {
it("should convert HTML to Markdown", () => {
const markdown = htmlToMarkdown(html, { extractors: false });
const d = distance(markdown, expected);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with replaced base URL", () => {
const markdown = htmlToMarkdown(html, { baseUrl: "https://example.com", extractors: false });
const d = distance(markdown, baseUrlReplaced);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with links as text", () => {
const markdown = htmlToMarkdown(html, { linkAsText: true, extractors: false });
const d = distance(markdown, linkAsText);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML to Markdown with hidden images", () => {
const markdown = htmlToMarkdown(html, { hideImage: true, extractors: false });
const d = distance(markdown, imageHidden);
expect(d).lte(2);
expect(d).lte(5);
});

it("should convert HTML table to Markdown table", () => {
const markdown = htmlToMarkdown(htmlTable, { extractors: false });
const d = distance(markdown, expectedTableMarkdown);
expect(d).lte(4);
expect(d).lte(5);
});

it("should convert HTML table with table as text option", () => {
Expand All @@ -127,6 +127,16 @@ describe("htmlToMarkdown E2E", () => {
// @ts-ignore
const original = await import("../README.md?raw");
const d = distance(markdown, original.default);
expect(d).lte(1000); // I'd like to optimise more!
expect(d).lte(400); // I'd like to optimise more!
});

it("should convert GitHub README to Markdown ", async () => {
  // Download the repository page and convert its HTML with the default extractors.
  const response = await fetch("https://github.com/inaridiy/webforai");
  const pageHtml = await response.text();
  // NOTE(review): the page comes from github.com but baseUrl points at npmjs.com — confirm this is intended.
  const converted = htmlToMarkdown(pageHtml, { baseUrl: "https://www.npmjs.com/package/webforai" });

  // Compare against the README source via edit distance.
  // @ts-ignore
  const { default: readmeSource } = await import("../../../README.md?raw");
  const editDistance = distance(converted, readmeSource);
  expect(editDistance).lte(200); // I'd like to optimise more!
});
});
8 changes: 5 additions & 3 deletions packages/webforai/src/html-to-mdast.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import type { Nodes as Hast } from "hast";
import type { Nodes as Mdast } from "mdast";

import { fromHtml } from "hast-util-from-html";
import { toMdast } from "hast-util-to-mdast";
import type { Nodes as Mdast } from "mdast";

import { type Extracotrs, extractHast } from "./extract-hast";
import { extractMdast } from "./extract-mdast";
import { customAHandler } from "./mdast-handlers/custom-a-handler";
import { customCodeHandler } from "./mdast-handlers/custom-code-handler";
import { customDivHandler } from "./mdast-handlers/custom-div-handler";
Expand Down Expand Up @@ -63,5 +63,7 @@ export const htmlToMdast = (htmlOrHast: string | Hast, options?: HtmlToMdastOpti
},
});

return mdast;
const extractedMdast = extractMdast(mdast);

return extractedMdast;
};
7 changes: 6 additions & 1 deletion packages/webforai/src/mdast-handlers/custom-a-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ export const customAHandler =
(options?: { asText?: boolean }): Handle =>
(state, node) => {
if (options?.asText) {
const link = { type: "text", value: hastToString(node) } as const;
const text = hastToString(node);
if (3 >= text.length) {
return undefined;
}

const link = { type: "text", value: text } as const;
state.patch(node, link);
return link;
}
Expand Down
1 change: 1 addition & 0 deletions packages/webforai/src/mdast-to-markdown.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export interface MdastToMarkdownOptions extends ToMarkdownOptions {
*/
export const DEFAULT_MDAST_TO_MARKDOWN_OPTIONS: MdastToMarkdownOptions = {
// Serializer extensions returned by gfmToMarkdown() and mathToMarkdown().
extensions: [gfmToMarkdown(), mathToMarkdown()],
// Emit "-" as the marker for unordered list items (e.g. "- Item 1").
bullet: "-",
};

/**
Expand Down
Loading