diff --git a/src/htmlStringToDocument.ts b/src/htmlStringToDocument.ts
new file mode 100644
index 0000000..30244cc
--- /dev/null
+++ b/src/htmlStringToDocument.ts
@@ -0,0 +1,252 @@
+import { parseHtml } from "./parseHtml";
+import {
+  BLOCKS,
+  NodeData,
+  TopLevelBlock,
+  Document,
+  MARKS,
+  INLINES,
+  Node,
+  Text,
+  Block,
+  Inline,
+  Mark,
+} from "@contentful/rich-text-types";
+import { HTMLElementNode, HTMLNode, HTMLTagName } from "./types";
+
+const BLOCK_TYPES = Object.values(BLOCKS);
+const INLINE_TYPES = Object.values(INLINES);
+const MARK_TYPES = Object.values(MARKS);
+
+const isBlockType = (nodeType: BLOCKS | MARKS | INLINES): nodeType is BLOCKS =>
+  BLOCK_TYPES.includes(nodeType as BLOCKS);
+const isInlineType = (
+  nodeType: BLOCKS | MARKS | INLINES
+): nodeType is INLINES => INLINE_TYPES.includes(nodeType as INLINES);
+const isMarkType = (nodeType: BLOCKS | MARKS | INLINES): nodeType is MARKS =>
+  MARK_TYPES.includes(nodeType as MARKS);
+
+const isNodeTypeMark = (node: Node | Text | Mark): node is Mark => {
+  return isMarkType((node as Mark).type as MARKS);
+};
+
+const isNodeTypeText = (node: Node | Text | Mark): node is Text => {
+  if (isNodeTypeMark(node)) {
+    return false;
+  }
+  if (node.nodeType === "text") {
+    return true;
+  }
+  return false;
+};
+
+/**
+ * Maps each supported HTML tag to the rich text node or mark type it produces.
+ * Tags missing from this table are skipped and only their children are kept.
+ */
+const HTML_TAG_NODE_TYPES: Partial<
+  Record<HTMLTagName, BLOCKS | MARKS | INLINES>
+> = {
+  h1: BLOCKS.HEADING_1,
+  h2: BLOCKS.HEADING_2,
+  h3: BLOCKS.HEADING_3,
+  h4: BLOCKS.HEADING_4,
+  h5: BLOCKS.HEADING_5,
+  h6: BLOCKS.HEADING_6,
+  hr: BLOCKS.HR,
+  li: BLOCKS.LIST_ITEM,
+  ol: BLOCKS.OL_LIST,
+  p: BLOCKS.PARAGRAPH,
+  blockquote: BLOCKS.QUOTE,
+  table: BLOCKS.TABLE,
+  td: BLOCKS.TABLE_CELL,
+  th: BLOCKS.TABLE_HEADER_CELL,
+  tr: BLOCKS.TABLE_ROW,
+  ul: BLOCKS.UL_LIST,
+  b: MARKS.BOLD,
+  strong: MARKS.BOLD,
+  pre: MARKS.CODE,
+  code: MARKS.CODE,
+  i: MARKS.ITALIC,
+  sub: MARKS.SUBSCRIPT,
+  sup: MARKS.SUPERSCRIPT,
+  u: MARKS.UNDERLINE,
+  a: INLINES.HYPERLINK,
+  img: BLOCKS.EMBEDDED_ASSET,
+  video: BLOCKS.EMBEDDED_ASSET,
+  audio: BLOCKS.EMBEDDED_ASSET,
+};
+
+/** Converts one parsed HTML node into zero or more rich text nodes or marks. */
+type Next = (node: HTMLNode) => Array<Node | Text | Mark>;
+
+const convertBlockNode =
+  (nodeType: BLOCKS) =>
+  (node: HTMLElementNode, next: Next): Array<Block> => {
+    const block: Block = {
+      nodeType,
+      content: next(node) as Array<Block | Inline | Text>,
+      data: {},
+    };
+    return [block];
+  };
+
+const convertInlineNode =
+  (nodeType: INLINES) =>
+  (node: HTMLElementNode, next: Next): Array<Inline> => {
+    const inline: Inline = {
+      nodeType,
+      content: next(node) as Array<Inline | Text>,
+      data: {},
+    };
+    return [inline];
+  };
+
+const convertAnchorNode = (node: HTMLElementNode, next: Next) => {
+  const anchor: Inline = {
+    nodeType: INLINES.HYPERLINK,
+    content: next(node) as Array<Inline | Text>,
+    data: {
+      uri: node.attrs.href,
+    },
+  };
+  return [anchor];
+};
+
+const convertMarkNode = (nodeType: MARKS) => (_: HTMLElementNode, __: Next) => {
+  const mark: Mark = {
+    type: nodeType,
+  };
+  return [mark];
+};
+
+type ConvertNode = Record<
+  HTMLTagName,
+  (node: HTMLElementNode, next: Next) => Array<Block | Inline>
+>;
+
+type ConvertMark = Record<
+  HTMLTagName,
+  (node: HTMLElementNode, next: Next) => Array<Mark>
+>;
+
+interface ConvertOptions {
+  convertNode: Partial<ConvertNode>;
+  convertMark: Partial<ConvertMark>;
+}
+
+const defaultConvertOptions: ConvertOptions = {
+  convertNode: {
+    h1: convertBlockNode(BLOCKS.HEADING_1),
+    h2: convertBlockNode(BLOCKS.HEADING_2),
+    h3: convertBlockNode(BLOCKS.HEADING_3),
+    h4: convertBlockNode(BLOCKS.HEADING_4),
+    h5: convertBlockNode(BLOCKS.HEADING_5),
+    h6: convertBlockNode(BLOCKS.HEADING_6),
+    hr: convertBlockNode(BLOCKS.HR),
+    li: convertBlockNode(BLOCKS.LIST_ITEM),
+    ol: convertBlockNode(BLOCKS.OL_LIST),
+    p: convertBlockNode(BLOCKS.PARAGRAPH),
+    blockquote: convertBlockNode(BLOCKS.QUOTE),
+    table: convertBlockNode(BLOCKS.TABLE),
+    td: convertBlockNode(BLOCKS.TABLE_CELL),
+    th: convertBlockNode(BLOCKS.TABLE_HEADER_CELL),
+    tr: convertBlockNode(BLOCKS.TABLE_ROW),
+    ul: convertBlockNode(BLOCKS.UL_LIST),
+    a: convertAnchorNode,
+    img: convertBlockNode(BLOCKS.EMBEDDED_ASSET),
+    video: convertBlockNode(BLOCKS.EMBEDDED_ASSET),
+    audio: convertBlockNode(BLOCKS.EMBEDDED_ASSET),
+  },
+  convertMark: {
+    b: convertMarkNode(MARKS.BOLD),
+    strong: convertMarkNode(MARKS.BOLD),
+    pre: convertMarkNode(MARKS.CODE),
+    code: convertMarkNode(MARKS.CODE),
+    i: convertMarkNode(MARKS.ITALIC),
+    sub: convertMarkNode(MARKS.SUBSCRIPT),
+    sup: convertMarkNode(MARKS.SUPERSCRIPT),
+    u: convertMarkNode(MARKS.UNDERLINE),
+  },
+};
+
+const createDocumentNode = (
+  content: TopLevelBlock[],
+  data: NodeData = {}
+): Document => ({
+  nodeType: BLOCKS.DOCUMENT,
+  data,
+  content,
+});
+
+/**
+ * Converts a parsed HTML node and, recursively, its children into rich text
+ * nodes. Text becomes an unmarked `Text` node; mark tags add their mark to the
+ * text nodes returned for their children; unmapped tags yield their children.
+ */
+const mapHtmlNodeToRichTextNode: Next = (node) => {
+  if (node.type === "text") {
+    const textNode: Text = {
+      nodeType: "text",
+      marks: [],
+      value: node.value,
+      data: {},
+    };
+    return [textNode];
+  }
+
+  const next: Next = (node) => {
+    if (node.type === "element") {
+      return node.children.flatMap((child) => mapHtmlNodeToRichTextNode(child));
+    }
+    return mapHtmlNodeToRichTextNode(node);
+  };
+
+  const nodeType = HTML_TAG_NODE_TYPES[node.tagName];
+
+  // Skip element if no node type is found
+  if (!nodeType) {
+    return next(node);
+  }
+
+  if (isBlockType(nodeType)) {
+    const converter = defaultConvertOptions.convertNode[node.tagName] ?? next;
+    const block = converter(node, next);
+    return block;
+  }
+
+  if (isInlineType(nodeType)) {
+    const converter = defaultConvertOptions.convertNode[node.tagName] ?? next;
+    const inline = converter(node, next);
+    return inline;
+  }
+
+  if (isMarkType(nodeType)) {
+    // A mark tag yields no node of its own; it only decorates its children's text.
+    const converter = defaultConvertOptions.convertMark[node.tagName];
+    const marks = converter ? converter(node, next) : [];
+    const children = next(node);
+    return children.map((child) => {
+      if (isNodeTypeText(child)) {
+        child.marks = [...child.marks, ...marks];
+      }
+      return child;
+    });
+  }
+
+  throw new Error(`Unknown nodeType ${nodeType}`);
+};
+
+/**
+ * Parses an HTML string and converts it into a Contentful rich text document.
+ *
+ * @param htmlString - HTML fragment to convert.
+ * @returns A document whose content is the converted top-level nodes.
+ */
+export const htmlStringToDocument = (htmlString: string): Document => {
+  const parsedHtml = parseHtml(htmlString);
+  const richTextNodes = parsedHtml.flatMap((node) =>
+    mapHtmlNodeToRichTextNode(node)
+  );
+  return createDocumentNode(richTextNodes as TopLevelBlock[]);
+};
diff --git a/src/parseHtml.ts b/src/parseHtml.ts
new file mode 100644
index 0000000..f2c8028
--- /dev/null
+++ b/src/parseHtml.ts
@@ -0,0 +1,71 @@
+import { parseFragment } from "parse5";
+import {
+  ChildNode,
+  Template,
+  DocumentType,
+  TextNode,
+  CommentNode,
+  Element,
+} from "parse5/dist/tree-adapters/default";
+import { isNotNull } from "./utils";
+
+import { HTMLNode, HTMLTagName } from "./types";
+
+const isChildNodeComment = (childNode: ChildNode): childNode is CommentNode => {
+  return childNode.nodeName === "#comment";
+};
+
+const isChildNodeTextNode = (childNode: ChildNode): childNode is TextNode => {
+  return childNode.nodeName === "#text";
+};
+
+const isChildNodeTemplate = (childNode: ChildNode): childNode is Template => {
+  return childNode.nodeName === "template";
+};
+
+const isChildNodeDocumentType = (
+  childNode: ChildNode
+): childNode is DocumentType => {
+  return childNode.nodeName === "#documentType";
+};
+
+/**
+ * Converts a parse5 child node into the simplified `HTMLNode` shape.
+ * Comments, doctypes and `<template>` elements yield `null` and are dropped.
+ */
+const mapChildNodeToHtmlNode = (childNode: ChildNode): HTMLNode | null => {
+  if (
+    isChildNodeComment(childNode) ||
+    isChildNodeDocumentType(childNode) ||
+    isChildNodeTemplate(childNode)
+  ) {
+    return null;
+  }
+  if (isChildNodeTextNode(childNode)) {
+    return {
+      type: "text",
+      value: childNode.value,
+    };
+  }
+
+  return {
+    type: "element",
+    tagName: childNode.tagName as HTMLTagName,
+    children: childNode.childNodes
+      .map((c) => mapChildNodeToHtmlNode(c))
+      .filter(isNotNull),
+    attrs: Object.fromEntries(
+      childNode.attrs.map((attr) => [attr.name, attr.value])
+    ),
+  };
+};
+
+/**
+ * Parses an HTML fragment string into a list of simplified `HTMLNode` trees.
+ */
+export const parseHtml = (htmlString: string): HTMLNode[] => {
+  const parsedHtml = parseFragment(htmlString);
+  return parsedHtml.childNodes
+    .map((node) => mapChildNodeToHtmlNode(node))
+    .filter(isNotNull);
+};
diff --git a/src/test/example.ts b/src/test/example.ts
new file mode 100644
index 0000000..085fbcd
--- /dev/null
+++ b/src/test/example.ts
@@ -0,0 +1,665 @@
+import { Document } from "@contentful/rich-text-types";
+
+export const EXAMPLE_RICH_TEXT = {
+  nodeType: "document",
+  data: {},
+  content: [
+    {
+      nodeType: "heading-2",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Heading 2",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "heading-3",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Heading 3",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "heading-4",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Heading 4",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "heading-5",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Heading 5",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Normal Text",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Bold Text",
+          marks: [
+            {
+              type: "bold",
+            },
+          ],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Italic Text",
+          marks: [
+            {
+              type: "italic",
+            },
+          ],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Underlined Text",
+          marks: [
+            {
+              type: "underline",
+            },
+          ],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Bold, Italic, and Underlined Sentence",
+          marks: [
+            {
+              type: "underline",
+            },
+            {
+              type: "italic",
+            },
+            {
+              type: "bold",
+            },
+          ],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Mixed ",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: "Bold",
+          marks: [
+            {
+              type: "bold",
+            },
+          ],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: ", ",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: "Italic",
+          marks: [
+            {
+              type: "italic",
+            },
+          ],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: ", and ",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: "Underlined",
+          marks: [
+            {
+              type: "underline",
+            },
+          ],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: " Sentence",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "hr",
+      data: {},
+      content: [],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "hyperlink",
+          data: {
+            uri: "https://example.com",
+          },
+          content: [
+            {
+              nodeType: "text",
+              value: "Hyperlink",
+              marks: [],
+              data: {},
+            },
+          ],
+        },
+        {
+          nodeType: "text",
+          value: " to a URL",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Sentence that is partially ",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: "bold",
+          marks: [
+            {
+              type: "bold",
+            },
+          ],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: " and partially a ",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "hyperlink",
+          data: {
+            uri: "https://example.com",
+          },
+          content: [
+            {
+              nodeType: "text",
+              value: "hyperlink",
+              marks: [],
+              data: {},
+            },
+          ],
+        },
+        {
+          nodeType: "text",
+          value: "",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "Sentence th",
+          marks: [],
+          data: {},
+        },
+        {
+          nodeType: "text",
+          value: "at is partially ",
+          marks: [
+            {
+              type: "bold",
+            },
+          ],
+          data: {},
+        },
+        {
+          nodeType: "hyperlink",
+          data: {
+            uri: "https://example.com",
+          },
+          content: [
+            {
+              nodeType: "text",
+              value: "overl",
+              marks: [
+                {
+                  type: "bold",
+                },
+              ],
+              data: {},
+            },
+            {
+              nodeType: "text",
+              value: "apping bold and",
+              marks: [],
+              data: {},
+            },
+          ],
+        },
+        {
+          nodeType: "text",
+          value: " hyperlink",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "unordered-list",
+      data: {},
+      content: [
+        {
+          nodeType: "list-item",
+          data: {},
+          content: [
+            {
+              nodeType: "paragraph",
+              data: {},
+              content: [
+                {
+                  nodeType: "text",
+                  value: "Unordered List Item 1",
+                  marks: [],
+                  data: {},
+                },
+              ],
+            },
+          ],
+        },
+        {
+          nodeType: "list-item",
+          data: {},
+          content: [
+            {
+              nodeType: "paragraph",
+              data: {},
+              content: [
+                {
+                  nodeType: "text",
+                  value: "Unordered List Item 2",
+                  marks: [],
+                  data: {},
+                },
+              ],
+            },
+          ],
+        },
+      ],
+    },
+    {
+      nodeType: "ordered-list",
+      data: {},
+      content: [
+        {
+          nodeType: "list-item",
+          data: {},
+          content: [
+            {
+              nodeType: "paragraph",
+              data: {},
+              content: [
+                {
+                  nodeType: "text",
+                  value: "Ordered List Item 1",
+                  marks: [],
+                  data: {},
+                },
+              ],
+            },
+          ],
+        },
+        {
+          nodeType: "list-item",
+          data: {},
+          content: [
+            {
+              nodeType: "paragraph",
+              data: {},
+              content: [
+                {
+                  nodeType: "text",
+                  value: "Ordered List Item 2",
+                  marks: [],
+                  data: {},
+                },
+              ],
+            },
+          ],
+        },
+      ],
+    },
+    {
+      nodeType: "blockquote",
+      data: {},
+      content: [
+        {
+          nodeType: "paragraph",
+          data: {},
+          content: [
+            {
+              nodeType: "text",
+              value: "Blockquote",
+              marks: [],
+              data: {},
+            },
+          ],
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+    {
+      nodeType: "table",
+      data: {},
+      content: [
+        {
+          nodeType: "table-row",
+          data: {},
+          content: [
+            {
+              nodeType: "table-header-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Header Cell 1",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+            {
+              nodeType: "table-header-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Header Cell 2",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+        },
+        {
+          nodeType: "table-row",
+          data: {},
+          content: [
+            {
+              nodeType: "table-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Cell AA",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+            {
+              nodeType: "table-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Cell BA",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+        },
+        {
+          nodeType: "table-row",
+          data: {},
+          content: [
+            {
+              nodeType: "table-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Cell AB",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+            {
+              nodeType: "table-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Cell BB",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+        },
+        {
+          nodeType: "table-row",
+          data: {},
+          content: [
+            {
+              nodeType: "table-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Cell ",
+                      marks: [],
+                      data: {},
+                    },
+                    {
+                      nodeType: "hyperlink",
+                      data: {
+                        uri: "https://example.com",
+                      },
+                      content: [
+                        {
+                          nodeType: "text",
+                          value: "Hyperlink",
+                          marks: [],
+                          data: {},
+                        },
+                      ],
+                    },
+                    {
+                      nodeType: "text",
+                      value: "",
+                      marks: [],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+            {
+              nodeType: "table-cell",
+              data: {},
+              content: [
+                {
+                  nodeType: "paragraph",
+                  data: {},
+                  content: [
+                    {
+                      nodeType: "text",
+                      value: "Table Cell Marks",
+                      marks: [
+                        {
+                          type: "bold",
+                        },
+                        {
+                          type: "italic",
+                        },
+                        {
+                          type: "underline",
+                        },
+                      ],
+                      data: {},
+                    },
+                  ],
+                },
+              ],
+            },
+          ],
+        },
+      ],
+    },
+    {
+      nodeType: "paragraph",
+      data: {},
+      content: [
+        {
+          nodeType: "text",
+          value: "",
+          marks: [],
+          data: {},
+        },
+      ],
+    },
+  ],
+} as Document;
diff --git a/src/test/index.test.ts b/src/test/index.test.ts
new file mode 100644
index 0000000..71737c8
--- /dev/null
+++ b/src/test/index.test.ts
@@ -0,0 +1,59 @@
+import { documentToHtmlString } from "@contentful/rich-text-html-renderer";
+import { BLOCKS, Document } from "@contentful/rich-text-types";
+import { htmlStringToDocument } from "../htmlStringToDocument";
+
+import { describe, expect, it } from "vitest";
+import { EXAMPLE_RICH_TEXT } from "./example";
+
+// https://www.contentful.com/developers/docs/tutorials/general/getting-started-with-rich-text-field-type/
+const richTextDocument: Document = {
+  nodeType: BLOCKS.DOCUMENT,
+  data: {},
+  content: [
+    {
+      nodeType: BLOCKS.PARAGRAPH,
+      content: [
+        {
+          nodeType: "text",
+          marks: [],
+          value: "I am an odd paragraph.",
+          data: {},
+        },
+      ],
+      data: {},
+    },
+    {
+      nodeType: BLOCKS.PARAGRAPH,
+      content: [
+        {
+          nodeType: "text",
+          marks: [],
+          value: "I am even.",
+          data: {},
+        },
+      ],
+      data: {},
+    },
+  ],
+};
+
+const htmlString = documentToHtmlString(EXAMPLE_RICH_TEXT);
+
+describe("Parse HTML string to Contentful Document", () => {
+  /*it("Parse string to generic HTML nodes", () => {
+    const htmlNodes = parseHtml(CISION_EXAMPLE);
+    expect(htmlNodes).toEqual([]);
+  });*/
+
+  /*it("Parse HTML string to Contentful Rich Text", () => {
+    const htmlNodes = htmlStringToDocument(CISION_EXAMPLE);
+    const newHtmlString = documentToHtmlString(htmlNodes);
+    expect(newHtmlString).toEqual(CISION_EXAMPLE);
+  });*/
+
+  it("Parse HTML string to Contentful Rich Text", () => {
+    const htmlNodes = htmlStringToDocument(htmlString);
+    const newHtmlString = documentToHtmlString(htmlNodes);
+    expect(newHtmlString).toEqual(htmlString);
+  });
+});
diff --git a/src/types.ts b/src/types.ts
new file mode 100644
index 0000000..62daa65
--- /dev/null
+++ b/src/types.ts
@@ -0,0 +1,15 @@
+export type HTMLTagName = keyof HTMLElementTagNameMap;
+
+export interface HTMLTextNode {
+  type: "text";
+  value: string;
+}
+
+export interface HTMLElementNode {
+  type: "element";
+  tagName: HTMLTagName;
+  children: HTMLNode[];
+  attrs: { [attr: string]: string };
+}
+
+export type HTMLNode = HTMLElementNode | HTMLTextNode;
diff --git a/src/utils.ts b/src/utils.ts
new file mode 100644
index 0000000..202f20c
--- /dev/null
+++ b/src/utils.ts
@@ -0,0 +1,3 @@
+/** Type guard that narrows `null` out of a value's type. */
+export const isNotNull = <T>(value: T): value is Exclude<T, null> =>
+  value !== null;