From 5d7aa59605265e2b2a30dddd80594bd20a5a50ef Mon Sep 17 00:00:00 2001 From: Richard Ore Date: Sat, 27 May 2023 21:52:31 +0100 Subject: [PATCH 1/3] Added html module to Blade built-in libs --- libs/html | 1 + 1 file changed, 1 insertion(+) create mode 160000 libs/html diff --git a/libs/html b/libs/html new file mode 160000 index 00000000..bd964c41 --- /dev/null +++ b/libs/html @@ -0,0 +1 @@ +Subproject commit bd964c41cc9c60f516a0c0204ec6939833b9d344 From 47ff20730c8703df5fcd825897c394429e911c49 Mon Sep 17 00:00:00 2001 From: Richard Ore Date: Sat, 27 May 2023 21:57:02 +0100 Subject: [PATCH 2/3] deleted gitted html lib --- libs/html | 1 - 1 file changed, 1 deletion(-) delete mode 160000 libs/html diff --git a/libs/html b/libs/html deleted file mode 160000 index bd964c41..00000000 --- a/libs/html +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bd964c41cc9c60f516a0c0204ec6939833b9d344 From d5eabcb4b58379a0f36a623015a0e95efecbbf3c Mon Sep 17 00:00:00 2001 From: Richard Ore Date: Sat, 27 May 2023 21:58:37 +0100 Subject: [PATCH 3/3] readding html --- libs/html/format.b | 50 +++++++ libs/html/html.b | 33 +++++ libs/html/index.b | 153 ++++++++++++++++++++++ libs/html/lexer.b | 315 +++++++++++++++++++++++++++++++++++++++++++++ libs/html/parser.b | 150 +++++++++++++++++++++ libs/html/tags.b | 55 ++++++++ 6 files changed, 756 insertions(+) create mode 100644 libs/html/format.b create mode 100644 libs/html/html.b create mode 100644 libs/html/index.b create mode 100644 libs/html/lexer.b create mode 100644 libs/html/parser.b create mode 100644 libs/html/tags.b diff --git a/libs/html/format.b b/libs/html/format.b new file mode 100644 index 00000000..0444884c --- /dev/null +++ b/libs/html/format.b @@ -0,0 +1,50 @@ +#!-- part of the html module + +import iters + +def split_head(str, sep) { + var idx = str.index_of(sep) + if idx == -1 return [str] + return [str[0, idx], str[idx + sep.length(),]] +} + +def unquote(str) { + var car = str[0] + var end = str.length() - 1 + var 
is_quote_start = car == '"' or car == "'" + if is_quote_start and car == str[end] { + return str[1, end] + } + return str +} + +def format(nodes, options) { + return iters.map(nodes, | node | { + var type = node.type + var output_node = type == 'element' ? { + type, + name: node.name, + attributes: format_attributes(node.attributes), + children: format(node.children, options), + } : { + type, + content: node.content, + } + if options.get('with_position', false) { + output_node.position = node.position + } + return output_node + }) +} + +def format_attributes(attributes) { + return iters.map(attributes, | attribute | { + var parts = split_head(attribute.trim(), '=') + var name = parts[0] + var value + if parts.length() > 1 { + value = is_string(parts[1]) ? unquote(parts[1]) : nil + } + return {name, value} + }) +} diff --git a/libs/html/html.b b/libs/html/html.b new file mode 100644 index 00000000..dbac1749 --- /dev/null +++ b/libs/html/html.b @@ -0,0 +1,33 @@ +#!-- part of the html module + +import iters + +def format_attributes(attributes) { + return iters.reduce(attributes, | attrs, attribute | { + if (attribute.value == nil) { + return '${attrs} ${attribute.name}' + } + # var quote_escape = attribute.value.index_of('\'') != -1 + # var quote = quote_escape ? '"' : '\'' + var quote = '"' + return '${attrs} ${attribute.name}=${quote}${attribute.value}${quote}' + }, '') +} + +def html(tree, options) { + var res = '' + for node in tree { + if is_list(node) { + res += html(node, options) + } else if (node.type == 'text') { + res += node.content + } else if (node.type == 'comment') { + res += '' + } else { + var is_self_closing = options.void_tags.contains(node.name.lower()) + res += is_self_closing ? 
'<${node.name}${format_attributes(node.attributes)}>' : + '<${node.name}${format_attributes(node.attributes)}>${html(node.children, options)}' + } + } + return res +} diff --git a/libs/html/index.b b/libs/html/index.b new file mode 100644 index 00000000..a44c398e --- /dev/null +++ b/libs/html/index.b @@ -0,0 +1,153 @@ +# +# @module html +# +# The html module provides interfaces for converting HTML to Blade and vice-versa. +# +# ## Nodes: Simplifying HTML Representation +# +# Nodes are the building blocks that describe HTML tags, including their attributes, contents, and nested children. This representation closely resembles how web browsers organize an HTML document in the Document Object Model (DOM) using nodes. When the `html` module decodes HTML, it generates a nested list of nodes. +# +# Within the `html` module, there are two main types of elements, just like in the HTML DOM: Text nodes and Element nodes. +# +# ### Element Node: Structuring HTML Elements +# +# An element node is represented by a dictionary with the following properties: +# +# - **type**: Indicates the node type. For element nodes, this value is always "element." +# - **name**: Specifies the name of the HTML element. +# - **attributes**: Contains a list of attributes associated with the HTML element, such as the `id` or `style` attribute. Each attribute consists of a `name` and `value` entry. +# - **children**: Represents a list of nodes that are nested within the current element. +# +# ### Text Node: Handling Textual Content +# +# A text node is represented by a dictionary with the following properties: +# +# - **type**: Indicates the node type. For text nodes, this value is always "text." +# - **content**: Stores the textual content of the element, equivalent to the `innerText` property in JavaScript. +# +# To identify the type of node, you can check the value of the `type` property. 
+# +# When calling the `decode()` function with the `with_position` option enabled, both node types will include an additional key called `position`. The position dictionary contains a `start` and an `end` entry, each of which provides the following information: +# +# - **index**: Represents the ordinal index in the source string that corresponds to that point (the start or the end of the node). +# - **line**: Specifies the line number in the HTML source where that point is located. +# - **column**: Indicates the offset, in terms of columns, of that point from the start of its line in the source. +# +# ## Options: Configuring Decode and Encode +# +# The default exported functions `decode()` and `encode()` offer an optional second argument that allows you to customize their behavior. You can pass a dictionary of options to these functions to modify their functionality. Below are the available options: +# +# - `void_tags`: Specifies a list of HTML tags that are considered void elements. +# - `closing_tags`: Specifies the HTML tags whose closing tag may be omitted; an open element with such a tag is closed automatically when a new tag of the same name starts. +# - `childless_tags`: Indicates a list of HTML tags that hold arbitrary non-parsed content and therefore do not have any element children. +# - `tag_ancestors`: Specifies, for tags that can be closed automatically, the ancestor tags that prevent that automatic closing. +# - `with_position`: Enables the inclusion of position information for nodes. +# +# By default, all these options are set to their exported values, adhering to the HTML specification. You can selectively specify options based on your requirements. Remember that when providing options, they are merged with the default values to determine the final configuration. Omitting options will keep their default behavior. +# +# > Note: This applies to any function within the module that accepts an `options` argument. 
+# +# @copyright 2023, Ore Richard Muyiwa and Blade contributors +# + +import .tags { * } +import .lexer +import .parser +import .format +import .html + +var _parse_defaults = { + void_tags, + closing_tags, + childless_tags, + tag_ancestors, + with_position: false +} + +/** + * decode(str [, options]) + * + * Decodes an HTML string into a list of nodes (described above) + * representing the structure of the HTML document. + * + * The _options_ argument is an optional argument that allows the caller + * to modify how HTML is decoded using one or more of the HTML options + * described above. For example, one can pass the `void_tags` option to + * declare a custom tag as self-closing and thus avoid an error from not + * closing such tags. + * + * Example, + * + * ```blade + * import html + * echo html.decode('

Hello World!

') + * ``` + * + * The code above should output the following: + * + * ``` + * [{type: element, name: p, attributes: [], children: [{type: text, content: Hello World!}]}] + * ``` + * + * You can include information about the position of the node in the source by setting the + * `with_position` option to `true`. + * + * For example: + * + * ```blade + * import html + * echo html.decode('', {with_position: true}) + * ``` + * + * The code should output the nodes with the position information. + * + * ``` + * [{type: element, name: img, attributes: [], children: [], position: {start: {index: 0, line: 1, column: 1}, end: {index: 5, line: 1, column: 6}}}] + * ``` + * + * @param string str + * @param dict options + * @returns list + */ +def decode(str, options) { + # create options + if !options options = _parse_defaults + else { + for key, value in _parse_defaults { + if options.get(key, nil) == nil { + options.set(key, value) + } + } + } + + var tokens = lexer(str, options) + var nodes = parser(tokens, options) + return format(nodes, options) +} + +/** + * encode(nodes [, options]) + * + * Encodes the list of `elements` into an HTML string. + * + * The _options_ argument is an optional argument that allows the caller + * to modify how HTML is encoded using one or more of the HTML options + * described above. For example, one can pass the `void_tags` option to + * declare a custom tag as self-closing. 
+ * + * @param list nodes + * @param dict options + * @returns string + */ +def encode(nodes, options) { + if !options options = _parse_defaults + else { + for key, value in _parse_defaults { + if options.get(key, nil) == nil { + options.set(key, value) + } + } + } + + return html(nodes, options) +} diff --git a/libs/html/lexer.b b/libs/html/lexer.b new file mode 100644 index 00000000..43d4e744 --- /dev/null +++ b/libs/html/lexer.b @@ -0,0 +1,315 @@ +#!-- part of the html module + +def feed_position(position, str, len) { + var start = position.index + var end = position.index = start + len + iter var i = start; i < end; i++ { + var char = str[i] + if char == '\n' { + position.line++ + position.column = 1 + } else { + position.column++ + } + } +} + +def jump_position(position, str, end) { + var len = end - position.index + return feed_position(position, str, len) +} + +def lexer(str, options) { + var state = { + str, + options, + position: { + index: 0, + line: 1, + column: 1, + }, + tokens: [] + } + lex(state) + return state.tokens +} + +def lex(state) { + var len = state.str.length() + while state.position.index < len { + var start = state.position.index + lex_text(state) + if state.position.index == start { + var is_comment = state.str.index_of('!--', start + 1) > -1 + if is_comment { + lex_comment(state) + } else { + var name = lex_tag(state) + if state.get('childless_tags', []).contains(name.lower()) { + lex_skip_tag(name, state) + } + } + } + } +} + +var alphanumeric = '/[A-_za-z0-9]/' + +def find_text_end(str, index) { + while true { + var text_end = str.index_of('<', index) + if text_end == -1 { + return text_end + } + var char = str[text_end + 1] + if char == '/' or char == '!' 
or char.match(alphanumeric) { + return text_end + } + index = text_end + 1 + } +} + +def lex_text(state) { + var type = 'text' + var text_end = find_text_end(state.str, state.position.index) + if text_end == state.position.index return + if text_end == -1 { + text_end = state.str.length() + } + + var start = state.position.clone() + var content = state.str[state.position.index, text_end] + jump_position(state.position, state.str, text_end) + var end = state.position.clone() + state.tokens.append({ + type, + content, + position: { + start, + end, + } + }) +} + +def lex_comment(state) { + var start = state.position.clone() + feed_position(state.position, state.str, 4) # "', state.position.index) + var comment_end = content_end + 3 # "-->".length() + if content_end == -1 { + content_end = comment_end = state.str.length() + } + + var content = state.str[state.position.index, content_end] + jump_position(state.position, state.str, comment_end) + state.tokens.append({ + type: 'comment', + content, + position: { + start, + end: state.position.clone() + } + }) +} + +def lex_tag(state) { + { + var second_char = state.str[state.position.index + 1] + var close = second_char == '/' + var start = state.position.clone() + feed_position(state.position, state.str, close ? 2 : 1) + state.tokens.append({ + type: 'tag-start', + close, + position: { + start, + } + }) + } + + var name = lex_tag_name(state) + lex_tag_attributes(state) + + { + var first_char = state.str[state.position.index] + var close = first_char == '/' + feed_position(state.position, state.str, close ? 
2 : 1) + var end = state.position.clone() + state.tokens.append({ + type: 'tag-end', + close, + position: { + end, + } + }) + } + + return name +} + +# _see https:#developer.mozilla.org/en-US/docs/_web/_java_script/_guide/_regular__expressions#special-white-space +var whitespace = '/\s/' + +def is_whitespace_char(char) { + return char.match(whitespace) +} + +def lex_tag_name(state) { + var len = state.str.length() + var start = state.position.index + while start < len { + var char = state.str[start] + var is_tag_char = !(is_whitespace_char(char) or char == '/' or char == '>') + if is_tag_char break + start++ + } + + var end = start + 1 + while end < len { + var char = state.str[end] + var is_tag_char = !(is_whitespace_char(char) or char == '/' or char == '>') + if !is_tag_char break + end++ + } + + jump_position(state.position, state.str, end) + var name = state.str[start, end] + state.tokens.append({ + type: 'tag', + content: name + }) + return name +} + +def lex_tag_attributes(state) { + var cursor = state.position.index + var quote = nil # nil, single-, or double-quote + var word_begin = cursor # index of word start + var words = [] # "key", "key=value", "key='value'", etc + var len = state.str.length() + while cursor < len { + var char = state.str[cursor] + if quote { + var is_quote_end = char == quote + if is_quote_end { + quote = nil + } + cursor++ + continue + } + + var is_tag_end = char == '/' or char == '>' + if is_tag_end { + if cursor != word_begin { + words.append(state.str[word_begin, cursor]) + } + break + } + + var is_word_end = is_whitespace_char(char) + if is_word_end { + if cursor != word_begin { + words.append(state.str[word_begin, cursor]) + } + word_begin = cursor + 1 + cursor++ + continue + } + + var is_quote_start = char == '\'' or char == '"' + if is_quote_start { + quote = char + cursor++ + continue + } + + cursor++ + } + jump_position(state.position, state.str, cursor) + + var w_len = words.length() + var type = 'attribute' + iter var i = 
0; i < w_len; i++ { + var word = words[i] + var is_not_pair = word.index_of('=') == -1 + if is_not_pair and words.length() > i + 1 { + var second_word = words[i + 1] + if second_word and second_word.index_of('=') > -1 { + if second_word.length() > 1 { + var new_word = word + second_word + state.tokens.append({type, content: new_word}) + i += 1 + continue + } + var third_word = words[i + 2] + i += 1 + if third_word { + var new_word = word + '=' + third_word + state.tokens.append({type, content: new_word}) + i += 1 + continue + } + } + } + if word.ends_with('=') { + var second_word = words[i + 1] + if second_word and second_word.index_of('=') > -1 { + var new_word = word + second_word + state.tokens.append({type, content: new_word}) + i += 1 + continue + } + + var new_word = word[0, -1] + state.tokens.append({type, content: new_word}) + continue + } + + state.tokens.append({type, content: word}) + } +} + +def lex_skip_tag(name, state) { + var safe_tag_name = name.lower() + var len = state.str.length() + var index = state.position.index + while index < len { + var next_tag = state.str.index_of('= 0 { + var parent_tag_name = stack[current_index].name + if parent_tag_name == name { + break + } + if tag_parents.contains(parent_tag_name) { + return true + } + current_index-- + } + } + return false +} + +def rewind_stack(stack, new_length, children_end_position, end_position) { + stack[new_length].position.end = end_position + var len = stack.length() + iter var i = new_length + 1; i < len; i++ { + stack[i].position.end = children_end_position + } + stack[new_length,] +} + +def parse(state) { + var nodes = state.stack[state.stack.length() - 1].children + var len = state.tokens.length() + while state.cursor < len { + var token = state.tokens[state.cursor] + if token.type != 'tag-start' { + nodes.append(token) + state.cursor++ + continue + } + + var tag_token = state.tokens[state.cursor++] + state.cursor++ + var name = tag_token.content.lower() + if token.close { + var index 
= state.stack.length() + var should_rewind = false + while index-- > -1 { + if state.stack[index].name == name { + should_rewind = true + break + } + } + while state.cursor < len { + var end_token = state.tokens[state.cursor] + if end_token.type != 'tag-end' break + state.cursor++ + } + if should_rewind { + rewind_stack(state.stack, index, token.position.start, state.tokens[state.cursor - 1].position.end) + break + } else { + continue + } + } + + var is_closing_tag = state.options.closing_tags.contains(name.lower()) + var should_rewind_to_auto_close = is_closing_tag + if should_rewind_to_auto_close { + var terminals = state.options.get('tag_ancestors') + should_rewind_to_auto_close = !has_terminal_parent(name.lower(), state.stack, terminals) + } + + if should_rewind_to_auto_close { + # rewind the stack to just above the previous + # closing tag of the same name + var current_index = state.stack.length() - 1 + while current_index > 0 { + if name == state.stack[current_index].name { + rewind_stack(state.stack, current_index, token.position.start, token.position.start) + var previous_index = current_index - 1 + nodes = state.stack[previous_index].children + break + } + current_index = current_index - 1 + } + } + + var attributes = [] + var attr_token + while state.cursor < len { + attr_token = state.tokens[state.cursor] + if attr_token.type == 'tag-end' break + attributes.append(attr_token.content) + state.cursor++ + } + + state.cursor++ + var children = [] + var position = { + start: token.position.start, + end: attr_token.position.end + } + var element_node = { + type: 'element', + name: tag_token.content, + attributes, + children, + position, + } + nodes.append(element_node) + + var has_children = !(attr_token.close or state.options.void_tags.contains(name.lower())) + if has_children { + var size = state.stack.append({ + name, + children, + position, + }) + var inner_state = { + tokens: state.tokens, + options: state.options, + cursor: state.cursor, + stack: 
state.stack, + } + parse(inner_state) + state.cursor = inner_state.cursor + var rewound_in_element = state.stack.length() == size + if rewound_in_element { + element_node.position.end = state.tokens[state.cursor - 1].position.end + } + } + } +} diff --git a/libs/html/tags.b b/libs/html/tags.b new file mode 100644 index 00000000..18f9c803 --- /dev/null +++ b/libs/html/tags.b @@ -0,0 +1,55 @@ +#!-- part of the html module + +/** + * Tags which contain arbitary non-parsed content + * For example: