diff --git a/.gitignore b/.gitignore index 0d770e8c..63ab5761 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ ext/nokogumbo/* /lib/nokogumbo/nokogumbo.dll /pkg /tmp +/gumbo-parser/googletest +/gumbo-parser/build +/test/html5lib-tests diff --git a/.travis.yml b/.travis.yml index 82583c94..94f986b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ os: - osx - linux rvm: # http://rubies.travis-ci.org/ - - 1.9 - 2.0 - 2.1 - 2.2 @@ -13,18 +12,38 @@ rvm: # http://rubies.travis-ci.org/ - 2.5 matrix: exclude: - - os: osx - rvm: 1.9 - os: osx rvm: 2.0 - os: osx - env: WITH_LIBXML=false V=1 + env: WITH_LIBXML=false + include: + - name: test gumbo + os: osx + language: cpp + install: + - curl -L https://github.com/google/googletest/archive/release-1.8.0.tar.gz | tar zxf - --strip-components 1 -C gumbo-parser googletest-release-1.8.0/googletest + - make -C gumbo-parser/googletest/make gtest_main.a + before_script: true + script: + - make -C gumbo-parser + - name: test gumbo + os: linux + language: cpp + install: + - curl -L https://github.com/google/googletest/archive/release-1.8.0.tar.gz | tar zxf - --strip-components 1 -C gumbo-parser googletest-release-1.8.0/googletest + - make -C gumbo-parser/googletest/make gtest_main.a + before_script: true + script: + - make -C gumbo-parser + env: - - WITH_LIBXML=true V=1 - - WITH_LIBXML=false V=1 -before_script: | - if [ "$WITH_LIBXML" == "false" ]; then - sudo apt-get remove libxml2-dev - fi + - WITH_LIBXML=true + - WITH_LIBXML=false +before_script: + - if [ "$WITH_LIBXML" = "false" ]; then sudo apt-get remove libxml2-dev; fi + - cd test && git clone --depth 1 --branch master --single-branch https://github.com/html5lib/html5lib-tests.git +script: + - MAKE='make V=1' bundle exec rake compile + - bundle exec rake sudo: required cache: bundler diff --git a/Rakefile b/Rakefile index 36f753a4..e573db75 100644 --- a/Rakefile +++ b/Rakefile @@ -27,6 +27,12 @@ SOURCES = ['ext/nokogumbo/extconf.rb', 
'ext/nokogumbo/nokogumbo.c'] # gem, package, and extension tasks task 'gem' => 'test' +desc 'Run the gumbo unit tests' +task 'test:gumbo' => 'gumbo-parser/googletest' do + sh(*%w{make -C gumbo-parser}) +end + + SPEC = Gem::Specification.new do |gem| gem.name = 'nokogumbo' gem.version = Nokogumbo::VERSION diff --git a/ext/nokogumbo/extconf.rb b/ext/nokogumbo/extconf.rb index aef75ccd..4829c7c3 100644 --- a/ext/nokogumbo/extconf.rb +++ b/ext/nokogumbo/extconf.rb @@ -1,6 +1,8 @@ require 'mkmf' $CFLAGS += " -std=c99" +CONFIG['warnflags'] = '-Wall' + if have_library('xml2', 'xmlNewDoc') # libxml2 libraries from http://www.xmlsoft.org/ pkg_config('libxml-2.0') @@ -28,11 +30,6 @@ require 'fileutils' gumbo_dir = File.expand_path('../../gumbo-parser', ext_dir) FileUtils.ln_s(Dir[File.join(gumbo_dir, 'src/*.[hc]')], ext_dir, force:true) - case RbConfig::CONFIG['target_os'] - when 'mingw32', /mswin/ - FileUtils.ln_s(Dir[File.join(gumbo_dir, 'visualc/include/*.h')], ext_dir, - force: true) - end # Set these to nil so that create_makefile picks up the new sources. 
$srcs = $objs = nil end diff --git a/ext/nokogumbo/nokogumbo.c b/ext/nokogumbo/nokogumbo.c index 1459e13f..51ac4f3a 100644 --- a/ext/nokogumbo/nokogumbo.c +++ b/ext/nokogumbo/nokogumbo.c @@ -21,7 +21,6 @@ #include #include "gumbo.h" #include "error.h" -#include "parser.h" // class constants static VALUE Document; @@ -34,7 +33,7 @@ static VALUE XMLSyntaxError; #define NIL NULL #define CONST_CAST (xmlChar const*) #else -#define NIL 0 +#define NIL Qnil #define CONST_CAST // more class constants @@ -45,11 +44,15 @@ static VALUE Comment; // interned symbols static VALUE new; +static VALUE attribute; static VALUE set_attribute; +static VALUE remove_attribute; static VALUE add_child; static VALUE internal_subset; static VALUE remove_; static VALUE create_internal_subset; +static VALUE key_; +static VALUE node_name_; // map libxml2 types to Ruby VALUE #define xmlNodePtr VALUE @@ -58,12 +61,10 @@ static VALUE create_internal_subset; // redefine libxml2 API as Ruby function calls #define xmlNewDocNode(doc, ns, name, content) \ rb_funcall(Element, new, 2, rb_str_new2(name), doc) -#define xmlNewProp(element, name, value) \ - rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value)) #define xmlNewDocText(doc, text) \ rb_funcall(Text, new, 2, rb_str_new2(text), doc) #define xmlNewCDataBlock(doc, content, length) \ - rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc) + rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length)) #define xmlNewDocComment(doc, text) \ rb_funcall(Comment, new, 2, doc, rb_str_new2(text)) #define xmlAddChild(element, node) \ @@ -83,6 +84,78 @@ static VALUE xmlNewDoc(char* version) { rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0); return doc; } + +static VALUE find_dummy_key(VALUE collection) { + VALUE r_dummy = Qnil; + char dummy[5] = "a"; + size_t len = 1; + while (len < sizeof dummy) { + r_dummy = rb_str_new(dummy, len); + if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse) + return r_dummy; + for (size_t 
i = 0; ; ++i) { + if (dummy[i] == 0) { + dummy[i] = 'a'; + ++len; + break; + } + if (dummy[i] == 'z') + dummy[i] = 'a'; + else { + ++dummy[i]; + break; + } + } + } + // This collection has 475254 elements?? Give up. + return Qnil; +} + +static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *value) { + // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value) + // which behaves roughly as + // if name is a QName prefix:local + // if node->doc has a namespace ns corresponding to prefix + // return xmlSetNsProp(node, ns, local, value) + // return xmlSetNsProp(node, NULL, name, value) + // + // If the prefix is "xml", then the namespace lookup will create it. + // + // By contrast, xmlNewProp does not do this parsing and creates an attribute + // with the name and value exactly as given. This is the behavior that we + // want. + // + // Thus, for attribute names like "xml:lang", #set_attribute will create an + // attribute with namespace "xml" and name "lang". This is incorrect for + // html elements (but correct for foreign elements). + // + // Work around this by inserting a dummy attribute and then changing the + // name, if needed. + + // Can't use strchr since it's locale-sensitive. + size_t len = strlen(name); + VALUE r_name = rb_str_new(name, len); + if (memchr(name, ':', len) == NULL) { + // No colon. + return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value)); + } + // Find a dummy attribute string that doesn't already exist. + VALUE dummy = find_dummy_key(node); + if (dummy == Qnil) + return Qnil; + // Add the dummy attribute. + VALUE r_value = rb_funcall(node, set_attribute, 2, dummy, rb_str_new2(value)); + if (r_value == Qnil) + return Qnil; + // Remove thet old attribute, if it exists. 
+ rb_funcall(node, remove_attribute, 1, r_name); + // Rename the dummy + VALUE attr = rb_funcall(node, attribute, 1, dummy); + if (attr == Qnil) + return Qnil; + rb_funcall(attr, node_name_, 1, r_name); + return attr; +} #endif // Build a xmlNodePtr for a given GumboNode (recursively) @@ -90,30 +163,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node); // Build a xmlNodePtr for a given GumboElement (recursively) static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) { - // determine tag name for a given node - xmlNodePtr element; - if (node->tag != GUMBO_TAG_UNKNOWN) { - element = xmlNewDocNode(document, NIL, - CONST_CAST gumbo_normalized_tagname(node->tag), NIL); - } else { - GumboStringPiece tag = node->original_tag; - gumbo_tag_from_original_text(&tag); -#ifdef _MSC_VER - char* name = alloca(tag.length+1); -#else - char name[tag.length+1]; -#endif - strncpy(name, tag.data, tag.length); - name[tag.length] = '\0'; - element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL); - } + // create the given element + xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL); // add in the attributes GumboVector* attrs = &node->attributes; char *name = NULL; - int namelen = 0; - char *ns; - for (int i=0; i < attrs->length; i++) { + size_t namelen = 0; + const char *ns; + for (size_t i=0; i < attrs->length; i++) { GumboAttribute *attr = attrs->data[i]; switch (attr->attr_namespace) { @@ -156,7 +214,7 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) { // add in the children GumboVector* children = &node->children; - for (int i=0; i < children->length; i++) { + for (size_t i=0; i < children->length; i++) { xmlNodePtr node = walk_tree(document, children->data[i]); if (node) xmlAddChild(element, node); } @@ -176,8 +234,8 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) { return xmlNewDocText(document, CONST_CAST node->v.text.text); case GUMBO_NODE_CDATA: return 
xmlNewCDataBlock(document, - CONST_CAST node->v.text.original_text.data, - (int) node->v.text.original_text.length); + CONST_CAST node->v.text.text, + (int) strlen(node->v.text.text)); case GUMBO_NODE_COMMENT: return xmlNewDocComment(document, CONST_CAST node->v.text.text); } @@ -200,12 +258,12 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { const char *public = output->document->v.document.public_identifier; const char *system = output->document->v.document.system_identifier; xmlCreateIntSubset(doc, CONST_CAST name, - (public[0] ? CONST_CAST public : NIL), - (system[0] ? CONST_CAST system : NIL)); + (public[0] ? CONST_CAST public : NULL), + (system[0] ? CONST_CAST system : NULL)); } GumboVector *children = &output->document->v.document.children; - for (int i=0; i < children->length; i++) { + for (size_t i=0; i < children->length; i++) { GumboNode *child = children->data[i]; xmlNodePtr node = walk_tree(doc, child); if (node) { @@ -221,15 +279,14 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { // Add parse errors to rdoc. 
if (output->errors.length) { GumboVector *errors = &output->errors; - GumboParser parser = { ._options = &options }; GumboStringBuffer msg; VALUE rerrors = rb_ary_new2(errors->length); - gumbo_string_buffer_init(&parser, &msg); - for (int i=0; i < errors->length; i++) { + gumbo_string_buffer_init(&msg); + for (size_t i=0; i < errors->length; i++) { GumboError *err = errors->data[i]; - gumbo_string_buffer_clear(&parser, &msg); - gumbo_caret_diagnostic_to_string(&parser, err, input, input_len, &msg); + gumbo_string_buffer_clear(&msg); + gumbo_caret_diagnostic_to_string(err, input, input_len, &msg); VALUE err_str = rb_str_new(msg.data, msg.length); VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError); rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER @@ -245,10 +302,10 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) { rb_ary_push(rerrors, syntax_error); } rb_iv_set(rdoc, "@errors", rerrors); - gumbo_string_buffer_destroy(&parser, &msg); + gumbo_string_buffer_destroy(&msg); } - gumbo_destroy_output(&options, output); + gumbo_destroy_output(output); return rdoc; } @@ -274,11 +331,15 @@ void Init_nokogumbo() { // interned symbols new = rb_intern("new"); + attribute = rb_intern("attribute"); set_attribute = rb_intern("set_attribute"); + remove_attribute = rb_intern("remove_attribute"); add_child = rb_intern("add_child_node_and_reparent_attrs"); internal_subset = rb_intern("internal_subset"); remove_ = rb_intern("remove"); create_internal_subset = rb_intern("create_internal_subset"); + key_ = rb_intern("key?"); + node_name_ = rb_intern("node_name="); #endif // define Nokogumbo module with a parse method diff --git a/gumbo-parser/.clang-format b/gumbo-parser/.clang-format deleted file mode 100644 index e2138046..00000000 --- a/gumbo-parser/.clang-format +++ /dev/null @@ -1,65 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: Google -AccessModifierOffset: -1 -AlignAfterOpenBracket: false -AlignEscapedNewlinesLeft: true 
-AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AlwaysBreakAfterDefinitionReturnType: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: None -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BinPackParameters: true -BinPackArguments: true -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 4 -DerivePointerAlignment: true -ExperimentalAutoDetectBinPacking: false -IndentCaseLabels: true -IndentWrappedFunctionNames: false -IndentFunctionDeclarationAfterType: false -MaxEmptyLinesToKeep: 1 -KeepEmptyLinesAtTheStartOfBlocks: false -NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 120 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Left -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Auto -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterCStyleCast: true -SpacesInContainerLiterals: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 -CommentPragmas: '^ IWYU pragma:' -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -SpaceBeforeParens: ControlStatements -DisableFormat: false -... 
- diff --git a/gumbo-parser/.gitignore b/gumbo-parser/.gitignore deleted file mode 100644 index 508df8fd..00000000 --- a/gumbo-parser/.gitignore +++ /dev/null @@ -1,79 +0,0 @@ -# Compilation artifacts -*.o -*.lo -*.la - -# Editor swap files -*.swp -*.swo -*.swn - -#emacs editor leftovers -*.*~ - -#diff leftovers -*.orig - -# gtest pieces -gtest -gtest-1.7.0 - -# Other build artifacts -/Debug -/visualc/Debug -/visualc/Release -/visualc/gumbo.sdf -/visualc/gumbo.opensdf -/build -.log -.sdf -.opensdf -.deps -.dirstamp -.libs -Makefile -Makefile.in -aclocal.m4 -autom4te.cache -compile -config.guess -config.log -config.status -config.sub -configure -depcomp -gumbo.pc -gumbo_test -gumbo_test.log -gumbo_test.trs -install-sh -libtool -ltmain.sh -m4/ -missing -test-driver -test-suite.log - -# gyp android artifacts -gumbo_parser.target.mk - -# `make dist` artifacts -/gumbo-[0-9].[0-9].tar.gz -/gumbo-[0-9].[0-9]/ - -# Python dist artifacts -*.pyc -*.dylib -dist -build -python/gumbo.egg-info -python/gumbo/libgumbo.so - -# Example binaries -benchmark -clean_text -find_links -get_title -positions_of_class -prettyprint -serialize diff --git a/gumbo-parser/.gitmodules b/gumbo-parser/.gitmodules deleted file mode 100644 index be8537ac..00000000 --- a/gumbo-parser/.gitmodules +++ /dev/null @@ -1,6 +0,0 @@ -[submodule "third_party/gtest"] - path = third_party/gtest - url = https://chromium.googlesource.com/external/googletest/ -[submodule "testdata"] - path = testdata - url = https://github.com/html5lib/html5lib-tests.git diff --git a/gumbo-parser/.travis.yml b/gumbo-parser/.travis.yml deleted file mode 100644 index d76208f5..00000000 --- a/gumbo-parser/.travis.yml +++ /dev/null @@ -1,26 +0,0 @@ -language: c++ - -compiler: - - gcc - - clang - -os: - - linux - - osx - -install: - - wget 'https://googletest.googlecode.com/files/gtest-1.7.0.zip' - - unzip gtest-1.7.0.zip - - ln -s gtest-1.7.0 gtest - - sudo pip install BeautifulSoup - - sudo pip install html5lib==0.95 - -script: - - 
./autogen.sh && ./configure && make && make check - - python python/gumbo/gumboc_test.py - - python python/gumbo/html5lib_adapter_test.py - - python python/gumbo/soup_adapter_test.py - - sudo make install - - g++ examples/clean_text.cc `pkg-config --cflags --libs gumbo` - - sudo python setup.py sdist install - - python -c 'import gumbo; gumbo.parse("Foo")' diff --git a/gumbo-parser/Makefile b/gumbo-parser/Makefile new file mode 100644 index 00000000..b1f688d8 --- /dev/null +++ b/gumbo-parser/Makefile @@ -0,0 +1,35 @@ +.PHONY: all clean check dirs + +gumbo_objs := $(patsubst %.c,build/%.o,$(wildcard src/*.c)) +test_objs := $(patsubst %.cc,build/%.o,$(wildcard test/*.cc)) +gtest_lib := googletest/make/gtest_main.a + +CPPFLAGS := -Isrc +CFLAGS := -std=c99 -Os +CXXFLAGS := -isystem googletest/include -std=c++11 -Os +LDFLAGS := -pthread + +all: check + +build/src: + mkdir -p "$@" + +build/test: + mkdir -p "$@" + +build/src/%.o: src/%.c | build/src + $(CC) -MMD $(CPPFLAGS) $(CFLAGS) -c -o $@ $< + +build/test/%.o: test/%.cc | build/test + $(CXX) -MMD $(CPPFLAGS) $(CXXFLAGS) -c -o $@ $< + +build/run_tests: $(gumbo_objs) $(test_objs) $(gtest_lib) + $(CXX) -o $@ $+ $(LDFLAGS) + +check: build/run_tests + ./build/run_tests + +clean: + $(RM) -r build + +-include $(test_objs:.o=.d) $(gumbo_objs:.o=.d) diff --git a/gumbo-parser/src/README.md b/gumbo-parser/src/README.md new file mode 100644 index 00000000..5e75c41e --- /dev/null +++ b/gumbo-parser/src/README.md @@ -0,0 +1,41 @@ +libgumbo +======== + +This is an internal fork of the [libgumbo] library, which was copied and +later modified under the terms of the Apache 2.0 [license]. See `lua-gumbo` +commit [`0a04728`] for details of the original import. 
+ +Since importing the code, the following notable fixes and improvements +have been made: + +* `91cef89`: Re-implement `adjust_foreign_attributes()` with a gperf hash +* `b11abe7`: Pass `TagSet` arrays into functions by reference instead of value +* `b73dc03`: Simplify `maybe_replace_codepoint()` function +* `d5d0bb3`: Remove special handling of `` tag +* `7bd5162`: Remove special handling of `` tag +* `a5c1b0e`: Use `realloc(3)` instead of `malloc(3)` in `enlarge_vector_if_full()` +* `dcbebd7`: Use `realloc(3)` instead of `malloc(3)` in `maybe_resize_string_buffer()` +* `df15262`: Make `destroy_node()` function non-recursive +* `2df37f5`: Fix signedness of some format specifiers +* `176553e`: Add maximum element nesting limit +* `bed0f4a`: Annotate `gumbo_debug()` with `PRINTF` macro and fix warnings +* `7ffc218`: Annotate `print_message()` with `PRINTF` macro and fix warnings +* `1bd8ab5`, `9136507`, `53a1f9a`: Deduplicate some identical `TagSet` arrays +* `a7a9065`: Add some GCC/Clang function attributes +* `8d3d4e4`: Remove custom allocator support +* `8d3b006`: Fix recording of source positions for `` end tags +* `1a8d763`: Replace linear search in `maybe_replace_codepoint()` with a lookup table +* `6dca79e`: Replace `strcasecmp()` and `strncasecmp()` with ascii-only equivalents +* `17ab1d2`: Fix `TAGSET_INCLUDES` macro to work properly with multiple bit flags +* `7e56d45`: Re-implement `gumbo_normalize_svg_tagname()` with a gperf hash +* `a518d35`: Replace linear array search in `adjust_svg_attributes()` with a gperf hash +* `a4a7433`: Fix duplicate `TagSet` initializer being ignored in `is_special_node()` +* `8137fcd`: Add support for `` tag +* `4b35471`: Add missing `static` qualifiers to hide symbols that shouldn't be extern +* `df57c59`, `03101f3`, `ea62330`: Replace use of locale-dependant `ctype.h` functions + with custom, ASCII-only equivalents + + +[libgumbo]: https://github.com/google/gumbo-parser/tree/aa91b27b02c0c80c482e24348a457ed7c3c088e0/src 
+[license]: https://github.com/google/gumbo-parser/blob/aa91b27b02c0c80c482e24348a457ed7c3c088e0/COPYING +[`0a04728`]: https://gitlab.com/craigbarnes/lua-gumbo/commit/0a047282815af86f3367a7d95fefcfe5723ece48 diff --git a/gumbo-parser/src/ascii.c b/gumbo-parser/src/ascii.c new file mode 100644 index 00000000..036dc406 --- /dev/null +++ b/gumbo-parser/src/ascii.c @@ -0,0 +1,33 @@ +#include "ascii.h" + +int gumbo_ascii_strcasecmp(const char *s1, const char *s2) { + int c1, c2; + while (*s1 && *s2) { + c1 = (int)(unsigned char) gumbo_ascii_tolower(*s1); + c2 = (int)(unsigned char) gumbo_ascii_tolower(*s2); + if (c1 != c2) { + return (c1 - c2); + } + s1++; + s2++; + } + return (((int)(unsigned char) *s1) - ((int)(unsigned char) *s2)); +} + +int gumbo_ascii_strncasecmp(const char *s1, const char *s2, size_t n) { + int c1, c2; + while (n && *s1 && *s2) { + n -= 1; + c1 = (int)(unsigned char) gumbo_ascii_tolower(*s1); + c2 = (int)(unsigned char) gumbo_ascii_tolower(*s2); + if (c1 != c2) { + return (c1 - c2); + } + s1++; + s2++; + } + if (n) { + return (((int)(unsigned char) *s1) - ((int)(unsigned char) *s2)); + } + return 0; +} diff --git a/gumbo-parser/src/ascii.h b/gumbo-parser/src/ascii.h new file mode 100644 index 00000000..729c69ac --- /dev/null +++ b/gumbo-parser/src/ascii.h @@ -0,0 +1,31 @@ +#ifndef GUMBO_ASCII_H_ +#define GUMBO_ASCII_H_ + +#include +#include "macros.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define gumbo_ascii_isupper(c) (((unsigned)(c) - 'A') < 26) + +CONST_FN +static inline int gumbo_ascii_tolower(int c) { + if (gumbo_ascii_isupper(c)) { + return c | 32; + } + return c; +} + +PURE NONNULL_ARGS +int gumbo_ascii_strcasecmp(const char *s1, const char *s2); + +PURE NONNULL_ARGS +int gumbo_ascii_strncasecmp(const char *s1, const char *s2, size_t n); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_ASCII_H_ diff --git a/gumbo-parser/src/attribute.c b/gumbo-parser/src/attribute.c index 234927a5..8967ee98 100644 --- 
a/gumbo-parser/src/attribute.c +++ b/gumbo-parser/src/attribute.c @@ -1,44 +1,42 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2018 Craig Barnes. + Copyright 2010 Google Inc. -#include "attribute.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ #include #include #include -#include - +#include "attribute.h" +#include "ascii.h" #include "util.h" -struct GumboInternalParser; - -GumboAttribute* gumbo_get_attribute( - const GumboVector* attributes, const char* name) { +GumboAttribute* gumbo_get_attribute ( + const GumboVector* attributes, + const char* name +) { for (unsigned int i = 0; i < attributes->length; ++i) { GumboAttribute* attr = attributes->data[i]; - if (!strcasecmp(attr->name, name)) { + if (!gumbo_ascii_strcasecmp(attr->name, name)) { return attr; } } return NULL; } -void gumbo_destroy_attribute( - struct GumboInternalParser* parser, GumboAttribute* attribute) { - gumbo_parser_deallocate(parser, (void*) attribute->name); - gumbo_parser_deallocate(parser, (void*) attribute->value); - gumbo_parser_deallocate(parser, (void*) attribute); +void gumbo_destroy_attribute(GumboAttribute* attribute) { + gumbo_free((void*) attribute->name); + gumbo_free((void*) attribute->value); + gumbo_free((void*) attribute); } diff --git a/gumbo-parser/src/attribute.h b/gumbo-parser/src/attribute.h index f9b8aea5..3383bde6 100644 --- a/gumbo-parser/src/attribute.h +++ b/gumbo-parser/src/attribute.h @@ -1,19 +1,3 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -// Author: jdtang@google.com (Jonathan Tang) - #ifndef GUMBO_ATTRIBUTE_H_ #define GUMBO_ATTRIBUTE_H_ @@ -23,15 +7,11 @@ extern "C" { #endif -struct GumboInternalParser; - -// Release the memory used for an GumboAttribute, including the attribute -// itself. -void gumbo_destroy_attribute( - struct GumboInternalParser* parser, GumboAttribute* attribute); +// Release the memory used for a GumboAttribute, including the attribute itself +void gumbo_destroy_attribute(GumboAttribute* attribute); #ifdef __cplusplus } #endif -#endif // GUMBO_ATTRIBUTE_H_ +#endif // GUMBO_ATTRIBUTE_H_ diff --git a/gumbo-parser/src/char_ref.c b/gumbo-parser/src/char_ref.c index a1d74fd5..d9e35214 100644 --- a/gumbo-parser/src/char_ref.c +++ b/gumbo-parser/src/char_ref.c @@ -1,68 +1,43 @@ +/* + Copyright 2017-2018 Craig Barnes. + Copyright 2011 Google Inc. -#line 1 "char_ref.rl" -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// This is a Ragel state machine re-implementation of the original char_ref.c, -// rewritten to improve efficiency. To generate the .c file from it, -// -// $ ragel -F0 char_ref.rl -// -// The generated source is also checked into source control so that most people -// hacking on the parser do not need to install ragel. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at -#include "char_ref.h" + https://www.apache.org/licenses/LICENSE-2.0 -#include -#include -#include -#include -#include // Only for debug assertions at present. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include +#include "char_ref.h" #include "error.h" -#include "string_piece.h" +#include "macros.h" #include "utf8.h" -#include "util.h" struct GumboInternalParser; const int kGumboNoChar = -1; -// Table of replacement characters. The spec specifies that any occurrence of -// the first character should be replaced by the second character, and a parse -// error recorded. -typedef struct { - int from_char; - int to_char; -} CharReplacement; - -static const CharReplacement kCharReplacements[] = {{0x00, 0xfffd}, - {0x0d, 0x000d}, {0x80, 0x20ac}, {0x81, 0x0081}, {0x82, 0x201A}, - {0x83, 0x0192}, {0x84, 0x201E}, {0x85, 0x2026}, {0x86, 0x2020}, - {0x87, 0x2021}, {0x88, 0x02C6}, {0x89, 0x2030}, {0x8A, 0x0160}, - {0x8B, 0x2039}, {0x8C, 0x0152}, {0x8D, 0x008D}, {0x8E, 0x017D}, - {0x8F, 0x008F}, {0x90, 0x0090}, {0x91, 0x2018}, {0x92, 0x2019}, - {0x93, 0x201C}, {0x94, 0x201D}, {0x95, 0x2022}, {0x96, 0x2013}, - {0x97, 0x2014}, {0x98, 0x02DC}, {0x99, 0x2122}, {0x9A, 0x0161}, - {0x9B, 0x203A}, {0x9C, 0x0153}, {0x9D, 0x009D}, {0x9E, 0x017E}, - {0x9F, 0x0178}, - // Terminator. 
- {-1, -1}}; +static const uint32_t kCharReplacements[] = { + [0x00] = 0xFFFD, [0x0D] = 0x000D, [0x80] = 0x20AC, [0x81] = 0x0081, + [0x82] = 0x201A, [0x83] = 0x0192, [0x84] = 0x201E, [0x85] = 0x2026, + [0x86] = 0x2020, [0x87] = 0x2021, [0x88] = 0x02C6, [0x89] = 0x2030, + [0x8A] = 0x0160, [0x8B] = 0x2039, [0x8C] = 0x0152, [0x8D] = 0x008D, + [0x8E] = 0x017D, [0x8F] = 0x008F, [0x90] = 0x0090, [0x91] = 0x2018, + [0x92] = 0x2019, [0x93] = 0x201C, [0x94] = 0x201D, [0x95] = 0x2022, + [0x96] = 0x2013, [0x97] = 0x2014, [0x98] = 0x02DC, [0x99] = 0x2122, + [0x9A] = 0x0161, [0x9B] = 0x203A, [0x9C] = 0x0153, [0x9D] = 0x009D, + [0x9E] = 0x017E, [0x9F] = 0x0178 +}; -static int parse_digit(int c, bool allow_hex) { +static int CONST_FN parse_digit(int c, bool allow_hex) { if (c >= '0' && c <= '9') { return c - '0'; } @@ -75,8 +50,10 @@ static int parse_digit(int c, bool allow_hex) { return -1; } -static void add_no_digit_error( - struct GumboInternalParser* parser, Utf8Iterator* input) { +static void add_no_digit_error ( + struct GumboInternalParser* parser, + Utf8Iterator* input +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -85,8 +62,12 @@ static void add_no_digit_error( error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS; } -static void add_codepoint_error(struct GumboInternalParser* parser, - Utf8Iterator* input, GumboErrorType type, int codepoint) { +static void add_codepoint_error ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + GumboErrorType type, + int codepoint +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -96,8 +77,12 @@ static void add_codepoint_error(struct GumboInternalParser* parser, error->v.codepoint = codepoint; } -static void add_named_reference_error(struct GumboInternalParser* parser, - Utf8Iterator* input, GumboErrorType type, GumboStringPiece text) { +static void add_named_reference_error ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + GumboErrorType type, + 
GumboStringPiece text +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -107,17 +92,15 @@ static void add_named_reference_error(struct GumboInternalParser* parser, error->v.text = text; } -static int maybe_replace_codepoint(int codepoint) { - for (int i = 0; kCharReplacements[i].from_char != -1; ++i) { - if (kCharReplacements[i].from_char == codepoint) { - return kCharReplacements[i].to_char; - } - } - return -1; +static uint32_t PURE maybe_replace_codepoint(uint32_t codepoint) { + return (codepoint > 0x9F) ? 0x00 : kCharReplacements[codepoint]; } -static bool consume_numeric_ref( - struct GumboInternalParser* parser, Utf8Iterator* input, int* output) { +static bool consume_numeric_ref ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + int* output +) { utf8iterator_next(input); bool is_hex = false; int c = utf8iterator_current(input); @@ -136,7 +119,7 @@ static bool consume_numeric_ref( return false; } - int codepoint = 0; + uint32_t codepoint = 0; bool status = true; do { codepoint = (codepoint * (is_hex ? 
16 : 10)) + digit; @@ -145,31 +128,47 @@ static bool consume_numeric_ref( } while (digit != -1); if (utf8iterator_current(input) != ';') { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, codepoint); + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, + codepoint + ); status = false; } else { utf8iterator_next(input); } - int replacement = maybe_replace_codepoint(codepoint); - if (replacement != -1) { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint); + uint32_t replacement = maybe_replace_codepoint(codepoint); + if (replacement != 0) { + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + codepoint + ); *output = replacement; return false; } if ((codepoint >= 0xd800 && codepoint <= 0xdfff) || codepoint > 0x10ffff) { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint); + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + codepoint + ); *output = 0xfffd; return false; } - if (utf8_is_invalid_code_point(codepoint) || codepoint == 0xb) { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint); + if (utf8_is_invalid_code_point(codepoint)) { + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + codepoint + ); status = false; // But return it anyway, per spec. } @@ -177,14 +176,19 @@ static bool consume_numeric_ref( return status; } -static bool maybe_add_invalid_named_reference( - struct GumboInternalParser* parser, Utf8Iterator* input) { +static bool maybe_add_invalid_named_reference ( + struct GumboInternalParser* parser, + Utf8Iterator* input +) { // The iterator will always be reset in this code path, so we don't need to // worry about consuming characters. 
const char* start = utf8iterator_get_char_pointer(input); int c = utf8iterator_current(input); - while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9')) { + while ( + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + ) { utf8iterator_next(input); c = utf8iterator_current(input); } @@ -192,18 +196,21 @@ static bool maybe_add_invalid_named_reference( GumboStringPiece bad_ref; bad_ref.data = start; bad_ref.length = utf8iterator_get_char_pointer(input) - start; - add_named_reference_error( - parser, input, GUMBO_ERR_NAMED_CHAR_REF_INVALID, bad_ref); + add_named_reference_error ( + parser, + input, + GUMBO_ERR_NAMED_CHAR_REF_INVALID, + bad_ref + ); return false; } return true; } -#line 2465 "char_ref.rl" -// clang-format off -#line 238 "char_ref.c" + + static const short _char_ref_actions[] = { 0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, @@ -13934,11 +13941,29 @@ static const int char_ref_start = 7623; static const int char_ref_en_valid_named_ref = 7623; -#line 2469 "char_ref.rl" -// clang-format on -static bool consume_named_ref(struct GumboInternalParser* parser, - Utf8Iterator* input, bool is_in_attribute, OneOrTwoCodepoints* output) { +static const unsigned char ascii_alnum_table[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0.. 15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16.. 31 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 32.. 47 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 48.. 63 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64.. 79 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 80.. 
95 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 96..111 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 112..127 + // 128..255: implicitly zero +}; + +static inline bool PURE ascii_isalnum(unsigned char ch) { + return ascii_alnum_table[ch]; +} + +static bool consume_named_ref ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + bool is_in_attribute, + OneOrTwoCodepoints* output +) { assert(output->first == kGumboNoChar); const char* p = utf8iterator_get_char_pointer(input); const char* pe = utf8iterator_get_end_pointer(input); @@ -13947,9 +13972,7 @@ static bool consume_named_ref(struct GumboInternalParser* parser, const char *ts, *start; int cs, act; -// clang-format off -#line 13985 "char_ref.c" { cs = char_ref_start; ts = 0; @@ -13957,7 +13980,6 @@ static bool consume_named_ref(struct GumboInternalParser* parser, act = 0; } -#line 2484 "char_ref.rl" // Avoid unused variable warnings. (void) act; (void) ts; @@ -13965,7 +13987,6 @@ static bool consume_named_ref(struct GumboInternalParser* parser, start = p; -#line 14001 "char_ref.c" { int _slen; int _trans; @@ -13984,10 +14005,8 @@ static bool consume_named_ref(struct GumboInternalParser* parser, while ( _nacts-- > 0 ) { switch ( *_acts++ ) { case 1: -#line 1 "NONE" {ts = p;} break; -#line 14023 "char_ref.c" } } @@ -14011,8966 +14030,6725 @@ static bool consume_named_ref(struct GumboInternalParser* parser, switch ( *(_acts++) ) { case 2: -#line 1 "NONE" {te = p+1;} break; case 3: -#line 233 "char_ref.rl" {te = p+1;{ output->first = 0xc6; {p++; goto _out; } }} break; case 4: -#line 235 "char_ref.rl" {te = p+1;{ output->first = 0x26; {p++; goto _out; } }} break; case 5: -#line 237 "char_ref.rl" {te = p+1;{ output->first = 0xc1; {p++; goto _out; } }} break; case 6: -#line 239 "char_ref.rl" {te = p+1;{ output->first = 0x0102; {p++; goto _out; } }} break; case 7: -#line 240 "char_ref.rl" {te = p+1;{ output->first = 0xc2; {p++; goto _out; } }} break; case 8: -#line 242 "char_ref.rl" {te = p+1;{ 
output->first = 0x0410; {p++; goto _out; } }} break; case 9: -#line 243 "char_ref.rl" {te = p+1;{ output->first = 0x0001d504; {p++; goto _out; } }} break; case 10: -#line 244 "char_ref.rl" {te = p+1;{ output->first = 0xc0; {p++; goto _out; } }} break; case 11: -#line 246 "char_ref.rl" {te = p+1;{ output->first = 0x0391; {p++; goto _out; } }} break; case 12: -#line 247 "char_ref.rl" {te = p+1;{ output->first = 0x0100; {p++; goto _out; } }} break; case 13: -#line 248 "char_ref.rl" {te = p+1;{ output->first = 0x2a53; {p++; goto _out; } }} break; case 14: -#line 249 "char_ref.rl" {te = p+1;{ output->first = 0x0104; {p++; goto _out; } }} break; case 15: -#line 250 "char_ref.rl" {te = p+1;{ output->first = 0x0001d538; {p++; goto _out; } }} break; case 16: -#line 251 "char_ref.rl" {te = p+1;{ output->first = 0x2061; {p++; goto _out; } }} break; case 17: -#line 252 "char_ref.rl" {te = p+1;{ output->first = 0xc5; {p++; goto _out; } }} break; case 18: -#line 254 "char_ref.rl" {te = p+1;{ output->first = 0x0001d49c; {p++; goto _out; } }} break; case 19: -#line 255 "char_ref.rl" {te = p+1;{ output->first = 0x2254; {p++; goto _out; } }} break; case 20: -#line 256 "char_ref.rl" {te = p+1;{ output->first = 0xc3; {p++; goto _out; } }} break; case 21: -#line 258 "char_ref.rl" {te = p+1;{ output->first = 0xc4; {p++; goto _out; } }} break; case 22: -#line 260 "char_ref.rl" {te = p+1;{ output->first = 0x2216; {p++; goto _out; } }} break; case 23: -#line 261 "char_ref.rl" {te = p+1;{ output->first = 0x2ae7; {p++; goto _out; } }} break; case 24: -#line 262 "char_ref.rl" {te = p+1;{ output->first = 0x2306; {p++; goto _out; } }} break; case 25: -#line 263 "char_ref.rl" {te = p+1;{ output->first = 0x0411; {p++; goto _out; } }} break; case 26: -#line 264 "char_ref.rl" {te = p+1;{ output->first = 0x2235; {p++; goto _out; } }} break; case 27: -#line 265 "char_ref.rl" {te = p+1;{ output->first = 0x212c; {p++; goto _out; } }} break; case 28: -#line 266 "char_ref.rl" {te = p+1;{ output->first = 
0x0392; {p++; goto _out; } }} break; case 29: -#line 267 "char_ref.rl" {te = p+1;{ output->first = 0x0001d505; {p++; goto _out; } }} break; case 30: -#line 268 "char_ref.rl" {te = p+1;{ output->first = 0x0001d539; {p++; goto _out; } }} break; case 31: -#line 269 "char_ref.rl" {te = p+1;{ output->first = 0x02d8; {p++; goto _out; } }} break; case 32: -#line 270 "char_ref.rl" {te = p+1;{ output->first = 0x212c; {p++; goto _out; } }} break; case 33: -#line 271 "char_ref.rl" {te = p+1;{ output->first = 0x224e; {p++; goto _out; } }} break; case 34: -#line 272 "char_ref.rl" {te = p+1;{ output->first = 0x0427; {p++; goto _out; } }} break; case 35: -#line 273 "char_ref.rl" {te = p+1;{ output->first = 0xa9; {p++; goto _out; } }} break; case 36: -#line 275 "char_ref.rl" {te = p+1;{ output->first = 0x0106; {p++; goto _out; } }} break; case 37: -#line 276 "char_ref.rl" {te = p+1;{ output->first = 0x22d2; {p++; goto _out; } }} break; case 38: -#line 277 "char_ref.rl" {te = p+1;{ output->first = 0x2145; {p++; goto _out; } }} break; case 39: -#line 278 "char_ref.rl" {te = p+1;{ output->first = 0x212d; {p++; goto _out; } }} break; case 40: -#line 279 "char_ref.rl" {te = p+1;{ output->first = 0x010c; {p++; goto _out; } }} break; case 41: -#line 280 "char_ref.rl" {te = p+1;{ output->first = 0xc7; {p++; goto _out; } }} break; case 42: -#line 282 "char_ref.rl" {te = p+1;{ output->first = 0x0108; {p++; goto _out; } }} break; case 43: -#line 283 "char_ref.rl" {te = p+1;{ output->first = 0x2230; {p++; goto _out; } }} break; case 44: -#line 284 "char_ref.rl" {te = p+1;{ output->first = 0x010a; {p++; goto _out; } }} break; case 45: -#line 285 "char_ref.rl" {te = p+1;{ output->first = 0xb8; {p++; goto _out; } }} break; case 46: -#line 286 "char_ref.rl" {te = p+1;{ output->first = 0xb7; {p++; goto _out; } }} break; case 47: -#line 287 "char_ref.rl" {te = p+1;{ output->first = 0x212d; {p++; goto _out; } }} break; case 48: -#line 288 "char_ref.rl" {te = p+1;{ output->first = 0x03a7; {p++; goto 
_out; } }} break; case 49: -#line 289 "char_ref.rl" {te = p+1;{ output->first = 0x2299; {p++; goto _out; } }} break; case 50: -#line 290 "char_ref.rl" {te = p+1;{ output->first = 0x2296; {p++; goto _out; } }} break; case 51: -#line 291 "char_ref.rl" {te = p+1;{ output->first = 0x2295; {p++; goto _out; } }} break; case 52: -#line 292 "char_ref.rl" {te = p+1;{ output->first = 0x2297; {p++; goto _out; } }} break; case 53: -#line 293 "char_ref.rl" {te = p+1;{ output->first = 0x2232; {p++; goto _out; } }} break; case 54: -#line 294 "char_ref.rl" {te = p+1;{ output->first = 0x201d; {p++; goto _out; } }} break; case 55: -#line 295 "char_ref.rl" {te = p+1;{ output->first = 0x2019; {p++; goto _out; } }} break; case 56: -#line 296 "char_ref.rl" {te = p+1;{ output->first = 0x2237; {p++; goto _out; } }} break; case 57: -#line 297 "char_ref.rl" {te = p+1;{ output->first = 0x2a74; {p++; goto _out; } }} break; case 58: -#line 298 "char_ref.rl" {te = p+1;{ output->first = 0x2261; {p++; goto _out; } }} break; case 59: -#line 299 "char_ref.rl" {te = p+1;{ output->first = 0x222f; {p++; goto _out; } }} break; case 60: -#line 300 "char_ref.rl" {te = p+1;{ output->first = 0x222e; {p++; goto _out; } }} break; case 61: -#line 301 "char_ref.rl" {te = p+1;{ output->first = 0x2102; {p++; goto _out; } }} break; case 62: -#line 302 "char_ref.rl" {te = p+1;{ output->first = 0x2210; {p++; goto _out; } }} break; case 63: -#line 303 "char_ref.rl" {te = p+1;{ output->first = 0x2233; {p++; goto _out; } }} break; case 64: -#line 304 "char_ref.rl" {te = p+1;{ output->first = 0x2a2f; {p++; goto _out; } }} break; case 65: -#line 305 "char_ref.rl" {te = p+1;{ output->first = 0x0001d49e; {p++; goto _out; } }} break; case 66: -#line 306 "char_ref.rl" {te = p+1;{ output->first = 0x22d3; {p++; goto _out; } }} break; case 67: -#line 307 "char_ref.rl" {te = p+1;{ output->first = 0x224d; {p++; goto _out; } }} break; case 68: -#line 308 "char_ref.rl" {te = p+1;{ output->first = 0x2145; {p++; goto _out; } }} 
break; case 69: -#line 309 "char_ref.rl" {te = p+1;{ output->first = 0x2911; {p++; goto _out; } }} break; case 70: -#line 310 "char_ref.rl" {te = p+1;{ output->first = 0x0402; {p++; goto _out; } }} break; case 71: -#line 311 "char_ref.rl" {te = p+1;{ output->first = 0x0405; {p++; goto _out; } }} break; case 72: -#line 312 "char_ref.rl" {te = p+1;{ output->first = 0x040f; {p++; goto _out; } }} break; case 73: -#line 313 "char_ref.rl" {te = p+1;{ output->first = 0x2021; {p++; goto _out; } }} break; case 74: -#line 314 "char_ref.rl" {te = p+1;{ output->first = 0x21a1; {p++; goto _out; } }} break; case 75: -#line 315 "char_ref.rl" {te = p+1;{ output->first = 0x2ae4; {p++; goto _out; } }} break; case 76: -#line 316 "char_ref.rl" {te = p+1;{ output->first = 0x010e; {p++; goto _out; } }} break; case 77: -#line 317 "char_ref.rl" {te = p+1;{ output->first = 0x0414; {p++; goto _out; } }} break; case 78: -#line 318 "char_ref.rl" {te = p+1;{ output->first = 0x2207; {p++; goto _out; } }} break; case 79: -#line 319 "char_ref.rl" {te = p+1;{ output->first = 0x0394; {p++; goto _out; } }} break; case 80: -#line 320 "char_ref.rl" {te = p+1;{ output->first = 0x0001d507; {p++; goto _out; } }} break; case 81: -#line 321 "char_ref.rl" {te = p+1;{ output->first = 0xb4; {p++; goto _out; } }} break; case 82: -#line 322 "char_ref.rl" {te = p+1;{ output->first = 0x02d9; {p++; goto _out; } }} break; case 83: -#line 323 "char_ref.rl" {te = p+1;{ output->first = 0x02dd; {p++; goto _out; } }} break; case 84: -#line 324 "char_ref.rl" {te = p+1;{ output->first = 0x60; {p++; goto _out; } }} break; case 85: -#line 325 "char_ref.rl" {te = p+1;{ output->first = 0x02dc; {p++; goto _out; } }} break; case 86: -#line 326 "char_ref.rl" {te = p+1;{ output->first = 0x22c4; {p++; goto _out; } }} break; case 87: -#line 327 "char_ref.rl" {te = p+1;{ output->first = 0x2146; {p++; goto _out; } }} break; case 88: -#line 328 "char_ref.rl" {te = p+1;{ output->first = 0x0001d53b; {p++; goto _out; } }} break; case 89: 
-#line 329 "char_ref.rl" {te = p+1;{ output->first = 0xa8; {p++; goto _out; } }} break; case 90: -#line 330 "char_ref.rl" {te = p+1;{ output->first = 0x20dc; {p++; goto _out; } }} break; case 91: -#line 331 "char_ref.rl" {te = p+1;{ output->first = 0x2250; {p++; goto _out; } }} break; case 92: -#line 332 "char_ref.rl" {te = p+1;{ output->first = 0x222f; {p++; goto _out; } }} break; case 93: -#line 333 "char_ref.rl" {te = p+1;{ output->first = 0xa8; {p++; goto _out; } }} break; case 94: -#line 334 "char_ref.rl" {te = p+1;{ output->first = 0x21d3; {p++; goto _out; } }} break; case 95: -#line 335 "char_ref.rl" {te = p+1;{ output->first = 0x21d0; {p++; goto _out; } }} break; case 96: -#line 336 "char_ref.rl" {te = p+1;{ output->first = 0x21d4; {p++; goto _out; } }} break; case 97: -#line 337 "char_ref.rl" {te = p+1;{ output->first = 0x2ae4; {p++; goto _out; } }} break; case 98: -#line 338 "char_ref.rl" {te = p+1;{ output->first = 0x27f8; {p++; goto _out; } }} break; case 99: -#line 339 "char_ref.rl" {te = p+1;{ output->first = 0x27fa; {p++; goto _out; } }} break; case 100: -#line 340 "char_ref.rl" {te = p+1;{ output->first = 0x27f9; {p++; goto _out; } }} break; case 101: -#line 341 "char_ref.rl" {te = p+1;{ output->first = 0x21d2; {p++; goto _out; } }} break; case 102: -#line 342 "char_ref.rl" {te = p+1;{ output->first = 0x22a8; {p++; goto _out; } }} break; case 103: -#line 343 "char_ref.rl" {te = p+1;{ output->first = 0x21d1; {p++; goto _out; } }} break; case 104: -#line 344 "char_ref.rl" {te = p+1;{ output->first = 0x21d5; {p++; goto _out; } }} break; case 105: -#line 345 "char_ref.rl" {te = p+1;{ output->first = 0x2225; {p++; goto _out; } }} break; case 106: -#line 346 "char_ref.rl" {te = p+1;{ output->first = 0x2193; {p++; goto _out; } }} break; case 107: -#line 347 "char_ref.rl" {te = p+1;{ output->first = 0x2913; {p++; goto _out; } }} break; case 108: -#line 348 "char_ref.rl" {te = p+1;{ output->first = 0x21f5; {p++; goto _out; } }} break; case 109: -#line 349 
"char_ref.rl" {te = p+1;{ output->first = 0x0311; {p++; goto _out; } }} break; case 110: -#line 350 "char_ref.rl" {te = p+1;{ output->first = 0x2950; {p++; goto _out; } }} break; case 111: -#line 351 "char_ref.rl" {te = p+1;{ output->first = 0x295e; {p++; goto _out; } }} break; case 112: -#line 352 "char_ref.rl" {te = p+1;{ output->first = 0x21bd; {p++; goto _out; } }} break; case 113: -#line 353 "char_ref.rl" {te = p+1;{ output->first = 0x2956; {p++; goto _out; } }} break; case 114: -#line 354 "char_ref.rl" {te = p+1;{ output->first = 0x295f; {p++; goto _out; } }} break; case 115: -#line 355 "char_ref.rl" {te = p+1;{ output->first = 0x21c1; {p++; goto _out; } }} break; case 116: -#line 356 "char_ref.rl" {te = p+1;{ output->first = 0x2957; {p++; goto _out; } }} break; case 117: -#line 357 "char_ref.rl" {te = p+1;{ output->first = 0x22a4; {p++; goto _out; } }} break; case 118: -#line 358 "char_ref.rl" {te = p+1;{ output->first = 0x21a7; {p++; goto _out; } }} break; case 119: -#line 359 "char_ref.rl" {te = p+1;{ output->first = 0x21d3; {p++; goto _out; } }} break; case 120: -#line 360 "char_ref.rl" {te = p+1;{ output->first = 0x0001d49f; {p++; goto _out; } }} break; case 121: -#line 361 "char_ref.rl" {te = p+1;{ output->first = 0x0110; {p++; goto _out; } }} break; case 122: -#line 362 "char_ref.rl" {te = p+1;{ output->first = 0x014a; {p++; goto _out; } }} break; case 123: -#line 363 "char_ref.rl" {te = p+1;{ output->first = 0xd0; {p++; goto _out; } }} break; case 124: -#line 365 "char_ref.rl" {te = p+1;{ output->first = 0xc9; {p++; goto _out; } }} break; case 125: -#line 367 "char_ref.rl" {te = p+1;{ output->first = 0x011a; {p++; goto _out; } }} break; case 126: -#line 368 "char_ref.rl" {te = p+1;{ output->first = 0xca; {p++; goto _out; } }} break; case 127: -#line 370 "char_ref.rl" {te = p+1;{ output->first = 0x042d; {p++; goto _out; } }} break; case 128: -#line 371 "char_ref.rl" {te = p+1;{ output->first = 0x0116; {p++; goto _out; } }} break; case 129: -#line 372 
"char_ref.rl" {te = p+1;{ output->first = 0x0001d508; {p++; goto _out; } }} break; case 130: -#line 373 "char_ref.rl" {te = p+1;{ output->first = 0xc8; {p++; goto _out; } }} break; case 131: -#line 375 "char_ref.rl" {te = p+1;{ output->first = 0x2208; {p++; goto _out; } }} break; case 132: -#line 376 "char_ref.rl" {te = p+1;{ output->first = 0x0112; {p++; goto _out; } }} break; case 133: -#line 377 "char_ref.rl" {te = p+1;{ output->first = 0x25fb; {p++; goto _out; } }} break; case 134: -#line 378 "char_ref.rl" {te = p+1;{ output->first = 0x25ab; {p++; goto _out; } }} break; case 135: -#line 379 "char_ref.rl" {te = p+1;{ output->first = 0x0118; {p++; goto _out; } }} break; case 136: -#line 380 "char_ref.rl" {te = p+1;{ output->first = 0x0001d53c; {p++; goto _out; } }} break; case 137: -#line 381 "char_ref.rl" {te = p+1;{ output->first = 0x0395; {p++; goto _out; } }} break; case 138: -#line 382 "char_ref.rl" {te = p+1;{ output->first = 0x2a75; {p++; goto _out; } }} break; case 139: -#line 383 "char_ref.rl" {te = p+1;{ output->first = 0x2242; {p++; goto _out; } }} break; case 140: -#line 384 "char_ref.rl" {te = p+1;{ output->first = 0x21cc; {p++; goto _out; } }} break; case 141: -#line 385 "char_ref.rl" {te = p+1;{ output->first = 0x2130; {p++; goto _out; } }} break; case 142: -#line 386 "char_ref.rl" {te = p+1;{ output->first = 0x2a73; {p++; goto _out; } }} break; case 143: -#line 387 "char_ref.rl" {te = p+1;{ output->first = 0x0397; {p++; goto _out; } }} break; case 144: -#line 388 "char_ref.rl" {te = p+1;{ output->first = 0xcb; {p++; goto _out; } }} break; case 145: -#line 390 "char_ref.rl" {te = p+1;{ output->first = 0x2203; {p++; goto _out; } }} break; case 146: -#line 391 "char_ref.rl" {te = p+1;{ output->first = 0x2147; {p++; goto _out; } }} break; case 147: -#line 392 "char_ref.rl" {te = p+1;{ output->first = 0x0424; {p++; goto _out; } }} break; case 148: -#line 393 "char_ref.rl" {te = p+1;{ output->first = 0x0001d509; {p++; goto _out; } }} break; case 149: 
-#line 394 "char_ref.rl" {te = p+1;{ output->first = 0x25fc; {p++; goto _out; } }} break; case 150: -#line 395 "char_ref.rl" {te = p+1;{ output->first = 0x25aa; {p++; goto _out; } }} break; case 151: -#line 396 "char_ref.rl" {te = p+1;{ output->first = 0x0001d53d; {p++; goto _out; } }} break; case 152: -#line 397 "char_ref.rl" {te = p+1;{ output->first = 0x2200; {p++; goto _out; } }} break; case 153: -#line 398 "char_ref.rl" {te = p+1;{ output->first = 0x2131; {p++; goto _out; } }} break; case 154: -#line 399 "char_ref.rl" {te = p+1;{ output->first = 0x2131; {p++; goto _out; } }} break; case 155: -#line 400 "char_ref.rl" {te = p+1;{ output->first = 0x0403; {p++; goto _out; } }} break; case 156: -#line 401 "char_ref.rl" {te = p+1;{ output->first = 0x3e; {p++; goto _out; } }} break; case 157: -#line 403 "char_ref.rl" {te = p+1;{ output->first = 0x0393; {p++; goto _out; } }} break; case 158: -#line 404 "char_ref.rl" {te = p+1;{ output->first = 0x03dc; {p++; goto _out; } }} break; case 159: -#line 405 "char_ref.rl" {te = p+1;{ output->first = 0x011e; {p++; goto _out; } }} break; case 160: -#line 406 "char_ref.rl" {te = p+1;{ output->first = 0x0122; {p++; goto _out; } }} break; case 161: -#line 407 "char_ref.rl" {te = p+1;{ output->first = 0x011c; {p++; goto _out; } }} break; case 162: -#line 408 "char_ref.rl" {te = p+1;{ output->first = 0x0413; {p++; goto _out; } }} break; case 163: -#line 409 "char_ref.rl" {te = p+1;{ output->first = 0x0120; {p++; goto _out; } }} break; case 164: -#line 410 "char_ref.rl" {te = p+1;{ output->first = 0x0001d50a; {p++; goto _out; } }} break; case 165: -#line 411 "char_ref.rl" {te = p+1;{ output->first = 0x22d9; {p++; goto _out; } }} break; case 166: -#line 412 "char_ref.rl" {te = p+1;{ output->first = 0x0001d53e; {p++; goto _out; } }} break; case 167: -#line 413 "char_ref.rl" {te = p+1;{ output->first = 0x2265; {p++; goto _out; } }} break; case 168: -#line 414 "char_ref.rl" {te = p+1;{ output->first = 0x22db; {p++; goto _out; } }} break; 
case 169: -#line 415 "char_ref.rl" {te = p+1;{ output->first = 0x2267; {p++; goto _out; } }} break; case 170: -#line 416 "char_ref.rl" {te = p+1;{ output->first = 0x2aa2; {p++; goto _out; } }} break; case 171: -#line 417 "char_ref.rl" {te = p+1;{ output->first = 0x2277; {p++; goto _out; } }} break; case 172: -#line 418 "char_ref.rl" {te = p+1;{ output->first = 0x2a7e; {p++; goto _out; } }} break; case 173: -#line 419 "char_ref.rl" {te = p+1;{ output->first = 0x2273; {p++; goto _out; } }} break; case 174: -#line 420 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4a2; {p++; goto _out; } }} break; case 175: -#line 421 "char_ref.rl" {te = p+1;{ output->first = 0x226b; {p++; goto _out; } }} break; case 176: -#line 422 "char_ref.rl" {te = p+1;{ output->first = 0x042a; {p++; goto _out; } }} break; case 177: -#line 423 "char_ref.rl" {te = p+1;{ output->first = 0x02c7; {p++; goto _out; } }} break; case 178: -#line 424 "char_ref.rl" {te = p+1;{ output->first = 0x5e; {p++; goto _out; } }} break; case 179: -#line 425 "char_ref.rl" {te = p+1;{ output->first = 0x0124; {p++; goto _out; } }} break; case 180: -#line 426 "char_ref.rl" {te = p+1;{ output->first = 0x210c; {p++; goto _out; } }} break; case 181: -#line 427 "char_ref.rl" {te = p+1;{ output->first = 0x210b; {p++; goto _out; } }} break; case 182: -#line 428 "char_ref.rl" {te = p+1;{ output->first = 0x210d; {p++; goto _out; } }} break; case 183: -#line 429 "char_ref.rl" {te = p+1;{ output->first = 0x2500; {p++; goto _out; } }} break; case 184: -#line 430 "char_ref.rl" {te = p+1;{ output->first = 0x210b; {p++; goto _out; } }} break; case 185: -#line 431 "char_ref.rl" {te = p+1;{ output->first = 0x0126; {p++; goto _out; } }} break; case 186: -#line 432 "char_ref.rl" {te = p+1;{ output->first = 0x224e; {p++; goto _out; } }} break; case 187: -#line 433 "char_ref.rl" {te = p+1;{ output->first = 0x224f; {p++; goto _out; } }} break; case 188: -#line 434 "char_ref.rl" {te = p+1;{ output->first = 0x0415; {p++; goto _out; } }} 
break; case 189: -#line 435 "char_ref.rl" {te = p+1;{ output->first = 0x0132; {p++; goto _out; } }} break; case 190: -#line 436 "char_ref.rl" {te = p+1;{ output->first = 0x0401; {p++; goto _out; } }} break; case 191: -#line 437 "char_ref.rl" {te = p+1;{ output->first = 0xcd; {p++; goto _out; } }} break; case 192: -#line 439 "char_ref.rl" {te = p+1;{ output->first = 0xce; {p++; goto _out; } }} break; case 193: -#line 441 "char_ref.rl" {te = p+1;{ output->first = 0x0418; {p++; goto _out; } }} break; case 194: -#line 442 "char_ref.rl" {te = p+1;{ output->first = 0x0130; {p++; goto _out; } }} break; case 195: -#line 443 "char_ref.rl" {te = p+1;{ output->first = 0x2111; {p++; goto _out; } }} break; case 196: -#line 444 "char_ref.rl" {te = p+1;{ output->first = 0xcc; {p++; goto _out; } }} break; case 197: -#line 446 "char_ref.rl" {te = p+1;{ output->first = 0x2111; {p++; goto _out; } }} break; case 198: -#line 447 "char_ref.rl" {te = p+1;{ output->first = 0x012a; {p++; goto _out; } }} break; case 199: -#line 448 "char_ref.rl" {te = p+1;{ output->first = 0x2148; {p++; goto _out; } }} break; case 200: -#line 449 "char_ref.rl" {te = p+1;{ output->first = 0x21d2; {p++; goto _out; } }} break; case 201: -#line 450 "char_ref.rl" {te = p+1;{ output->first = 0x222c; {p++; goto _out; } }} break; case 202: -#line 451 "char_ref.rl" {te = p+1;{ output->first = 0x222b; {p++; goto _out; } }} break; case 203: -#line 452 "char_ref.rl" {te = p+1;{ output->first = 0x22c2; {p++; goto _out; } }} break; case 204: -#line 453 "char_ref.rl" {te = p+1;{ output->first = 0x2063; {p++; goto _out; } }} break; case 205: -#line 454 "char_ref.rl" {te = p+1;{ output->first = 0x2062; {p++; goto _out; } }} break; case 206: -#line 455 "char_ref.rl" {te = p+1;{ output->first = 0x012e; {p++; goto _out; } }} break; case 207: -#line 456 "char_ref.rl" {te = p+1;{ output->first = 0x0001d540; {p++; goto _out; } }} break; case 208: -#line 457 "char_ref.rl" {te = p+1;{ output->first = 0x0399; {p++; goto _out; } }} 
break; case 209: -#line 458 "char_ref.rl" {te = p+1;{ output->first = 0x2110; {p++; goto _out; } }} break; case 210: -#line 459 "char_ref.rl" {te = p+1;{ output->first = 0x0128; {p++; goto _out; } }} break; case 211: -#line 460 "char_ref.rl" {te = p+1;{ output->first = 0x0406; {p++; goto _out; } }} break; case 212: -#line 461 "char_ref.rl" {te = p+1;{ output->first = 0xcf; {p++; goto _out; } }} break; case 213: -#line 463 "char_ref.rl" {te = p+1;{ output->first = 0x0134; {p++; goto _out; } }} break; case 214: -#line 464 "char_ref.rl" {te = p+1;{ output->first = 0x0419; {p++; goto _out; } }} break; case 215: -#line 465 "char_ref.rl" {te = p+1;{ output->first = 0x0001d50d; {p++; goto _out; } }} break; case 216: -#line 466 "char_ref.rl" {te = p+1;{ output->first = 0x0001d541; {p++; goto _out; } }} break; case 217: -#line 467 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4a5; {p++; goto _out; } }} break; case 218: -#line 468 "char_ref.rl" {te = p+1;{ output->first = 0x0408; {p++; goto _out; } }} break; case 219: -#line 469 "char_ref.rl" {te = p+1;{ output->first = 0x0404; {p++; goto _out; } }} break; case 220: -#line 470 "char_ref.rl" {te = p+1;{ output->first = 0x0425; {p++; goto _out; } }} break; case 221: -#line 471 "char_ref.rl" {te = p+1;{ output->first = 0x040c; {p++; goto _out; } }} break; case 222: -#line 472 "char_ref.rl" {te = p+1;{ output->first = 0x039a; {p++; goto _out; } }} break; case 223: -#line 473 "char_ref.rl" {te = p+1;{ output->first = 0x0136; {p++; goto _out; } }} break; case 224: -#line 474 "char_ref.rl" {te = p+1;{ output->first = 0x041a; {p++; goto _out; } }} break; case 225: -#line 475 "char_ref.rl" {te = p+1;{ output->first = 0x0001d50e; {p++; goto _out; } }} break; case 226: -#line 476 "char_ref.rl" {te = p+1;{ output->first = 0x0001d542; {p++; goto _out; } }} break; case 227: -#line 477 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4a6; {p++; goto _out; } }} break; case 228: -#line 478 "char_ref.rl" {te = p+1;{ output->first = 0x0409; 
{p++; goto _out; } }} break; case 229: -#line 479 "char_ref.rl" {te = p+1;{ output->first = 0x3c; {p++; goto _out; } }} break; case 230: -#line 481 "char_ref.rl" {te = p+1;{ output->first = 0x0139; {p++; goto _out; } }} break; case 231: -#line 482 "char_ref.rl" {te = p+1;{ output->first = 0x039b; {p++; goto _out; } }} break; case 232: -#line 483 "char_ref.rl" {te = p+1;{ output->first = 0x27ea; {p++; goto _out; } }} break; case 233: -#line 484 "char_ref.rl" {te = p+1;{ output->first = 0x2112; {p++; goto _out; } }} break; case 234: -#line 485 "char_ref.rl" {te = p+1;{ output->first = 0x219e; {p++; goto _out; } }} break; case 235: -#line 486 "char_ref.rl" {te = p+1;{ output->first = 0x013d; {p++; goto _out; } }} break; case 236: -#line 487 "char_ref.rl" {te = p+1;{ output->first = 0x013b; {p++; goto _out; } }} break; case 237: -#line 488 "char_ref.rl" {te = p+1;{ output->first = 0x041b; {p++; goto _out; } }} break; case 238: -#line 489 "char_ref.rl" {te = p+1;{ output->first = 0x27e8; {p++; goto _out; } }} break; case 239: -#line 490 "char_ref.rl" {te = p+1;{ output->first = 0x2190; {p++; goto _out; } }} break; case 240: -#line 491 "char_ref.rl" {te = p+1;{ output->first = 0x21e4; {p++; goto _out; } }} break; case 241: -#line 492 "char_ref.rl" {te = p+1;{ output->first = 0x21c6; {p++; goto _out; } }} break; case 242: -#line 493 "char_ref.rl" {te = p+1;{ output->first = 0x2308; {p++; goto _out; } }} break; case 243: -#line 494 "char_ref.rl" {te = p+1;{ output->first = 0x27e6; {p++; goto _out; } }} break; case 244: -#line 495 "char_ref.rl" {te = p+1;{ output->first = 0x2961; {p++; goto _out; } }} break; case 245: -#line 496 "char_ref.rl" {te = p+1;{ output->first = 0x21c3; {p++; goto _out; } }} break; case 246: -#line 497 "char_ref.rl" {te = p+1;{ output->first = 0x2959; {p++; goto _out; } }} break; case 247: -#line 498 "char_ref.rl" {te = p+1;{ output->first = 0x230a; {p++; goto _out; } }} break; case 248: -#line 499 "char_ref.rl" {te = p+1;{ output->first = 0x2194; 
{p++; goto _out; } }} break; case 249: -#line 500 "char_ref.rl" {te = p+1;{ output->first = 0x294e; {p++; goto _out; } }} break; case 250: -#line 501 "char_ref.rl" {te = p+1;{ output->first = 0x22a3; {p++; goto _out; } }} break; case 251: -#line 502 "char_ref.rl" {te = p+1;{ output->first = 0x21a4; {p++; goto _out; } }} break; case 252: -#line 503 "char_ref.rl" {te = p+1;{ output->first = 0x295a; {p++; goto _out; } }} break; case 253: -#line 504 "char_ref.rl" {te = p+1;{ output->first = 0x22b2; {p++; goto _out; } }} break; case 254: -#line 505 "char_ref.rl" {te = p+1;{ output->first = 0x29cf; {p++; goto _out; } }} break; case 255: -#line 506 "char_ref.rl" {te = p+1;{ output->first = 0x22b4; {p++; goto _out; } }} break; case 256: -#line 507 "char_ref.rl" {te = p+1;{ output->first = 0x2951; {p++; goto _out; } }} break; case 257: -#line 508 "char_ref.rl" {te = p+1;{ output->first = 0x2960; {p++; goto _out; } }} break; case 258: -#line 509 "char_ref.rl" {te = p+1;{ output->first = 0x21bf; {p++; goto _out; } }} break; case 259: -#line 510 "char_ref.rl" {te = p+1;{ output->first = 0x2958; {p++; goto _out; } }} break; case 260: -#line 511 "char_ref.rl" {te = p+1;{ output->first = 0x21bc; {p++; goto _out; } }} break; case 261: -#line 512 "char_ref.rl" {te = p+1;{ output->first = 0x2952; {p++; goto _out; } }} break; case 262: -#line 513 "char_ref.rl" {te = p+1;{ output->first = 0x21d0; {p++; goto _out; } }} break; case 263: -#line 514 "char_ref.rl" {te = p+1;{ output->first = 0x21d4; {p++; goto _out; } }} break; case 264: -#line 515 "char_ref.rl" {te = p+1;{ output->first = 0x22da; {p++; goto _out; } }} break; case 265: -#line 516 "char_ref.rl" {te = p+1;{ output->first = 0x2266; {p++; goto _out; } }} break; case 266: -#line 517 "char_ref.rl" {te = p+1;{ output->first = 0x2276; {p++; goto _out; } }} break; case 267: -#line 518 "char_ref.rl" {te = p+1;{ output->first = 0x2aa1; {p++; goto _out; } }} break; case 268: -#line 519 "char_ref.rl" {te = p+1;{ output->first = 0x2a7d; 
{p++; goto _out; } }} break; case 269: -#line 520 "char_ref.rl" {te = p+1;{ output->first = 0x2272; {p++; goto _out; } }} break; case 270: -#line 521 "char_ref.rl" {te = p+1;{ output->first = 0x0001d50f; {p++; goto _out; } }} break; case 271: -#line 522 "char_ref.rl" {te = p+1;{ output->first = 0x22d8; {p++; goto _out; } }} break; case 272: -#line 523 "char_ref.rl" {te = p+1;{ output->first = 0x21da; {p++; goto _out; } }} break; case 273: -#line 524 "char_ref.rl" {te = p+1;{ output->first = 0x013f; {p++; goto _out; } }} break; case 274: -#line 525 "char_ref.rl" {te = p+1;{ output->first = 0x27f5; {p++; goto _out; } }} break; case 275: -#line 526 "char_ref.rl" {te = p+1;{ output->first = 0x27f7; {p++; goto _out; } }} break; case 276: -#line 527 "char_ref.rl" {te = p+1;{ output->first = 0x27f6; {p++; goto _out; } }} break; case 277: -#line 528 "char_ref.rl" {te = p+1;{ output->first = 0x27f8; {p++; goto _out; } }} break; case 278: -#line 529 "char_ref.rl" {te = p+1;{ output->first = 0x27fa; {p++; goto _out; } }} break; case 279: -#line 530 "char_ref.rl" {te = p+1;{ output->first = 0x27f9; {p++; goto _out; } }} break; case 280: -#line 531 "char_ref.rl" {te = p+1;{ output->first = 0x0001d543; {p++; goto _out; } }} break; case 281: -#line 532 "char_ref.rl" {te = p+1;{ output->first = 0x2199; {p++; goto _out; } }} break; case 282: -#line 533 "char_ref.rl" {te = p+1;{ output->first = 0x2198; {p++; goto _out; } }} break; case 283: -#line 534 "char_ref.rl" {te = p+1;{ output->first = 0x2112; {p++; goto _out; } }} break; case 284: -#line 535 "char_ref.rl" {te = p+1;{ output->first = 0x21b0; {p++; goto _out; } }} break; case 285: -#line 536 "char_ref.rl" {te = p+1;{ output->first = 0x0141; {p++; goto _out; } }} break; case 286: -#line 537 "char_ref.rl" {te = p+1;{ output->first = 0x226a; {p++; goto _out; } }} break; case 287: -#line 538 "char_ref.rl" {te = p+1;{ output->first = 0x2905; {p++; goto _out; } }} break; case 288: -#line 539 "char_ref.rl" {te = p+1;{ output->first = 
0x041c; {p++; goto _out; } }} break; case 289: -#line 540 "char_ref.rl" {te = p+1;{ output->first = 0x205f; {p++; goto _out; } }} break; case 290: -#line 541 "char_ref.rl" {te = p+1;{ output->first = 0x2133; {p++; goto _out; } }} break; case 291: -#line 542 "char_ref.rl" {te = p+1;{ output->first = 0x0001d510; {p++; goto _out; } }} break; case 292: -#line 543 "char_ref.rl" {te = p+1;{ output->first = 0x2213; {p++; goto _out; } }} break; case 293: -#line 544 "char_ref.rl" {te = p+1;{ output->first = 0x0001d544; {p++; goto _out; } }} break; case 294: -#line 545 "char_ref.rl" {te = p+1;{ output->first = 0x2133; {p++; goto _out; } }} break; case 295: -#line 546 "char_ref.rl" {te = p+1;{ output->first = 0x039c; {p++; goto _out; } }} break; case 296: -#line 547 "char_ref.rl" {te = p+1;{ output->first = 0x040a; {p++; goto _out; } }} break; case 297: -#line 548 "char_ref.rl" {te = p+1;{ output->first = 0x0143; {p++; goto _out; } }} break; case 298: -#line 549 "char_ref.rl" {te = p+1;{ output->first = 0x0147; {p++; goto _out; } }} break; case 299: -#line 550 "char_ref.rl" {te = p+1;{ output->first = 0x0145; {p++; goto _out; } }} break; case 300: -#line 551 "char_ref.rl" {te = p+1;{ output->first = 0x041d; {p++; goto _out; } }} break; case 301: -#line 552 "char_ref.rl" {te = p+1;{ output->first = 0x200b; {p++; goto _out; } }} break; case 302: -#line 553 "char_ref.rl" {te = p+1;{ output->first = 0x200b; {p++; goto _out; } }} break; case 303: -#line 554 "char_ref.rl" {te = p+1;{ output->first = 0x200b; {p++; goto _out; } }} break; case 304: -#line 555 "char_ref.rl" {te = p+1;{ output->first = 0x200b; {p++; goto _out; } }} break; case 305: -#line 556 "char_ref.rl" {te = p+1;{ output->first = 0x226b; {p++; goto _out; } }} break; case 306: -#line 557 "char_ref.rl" {te = p+1;{ output->first = 0x226a; {p++; goto _out; } }} break; case 307: -#line 558 "char_ref.rl" {te = p+1;{ output->first = 0x0a; {p++; goto _out; } }} break; case 308: -#line 559 "char_ref.rl" {te = p+1;{ 
output->first = 0x0001d511; {p++; goto _out; } }} break; case 309: -#line 560 "char_ref.rl" {te = p+1;{ output->first = 0x2060; {p++; goto _out; } }} break; case 310: -#line 561 "char_ref.rl" {te = p+1;{ output->first = 0xa0; {p++; goto _out; } }} break; case 311: -#line 562 "char_ref.rl" {te = p+1;{ output->first = 0x2115; {p++; goto _out; } }} break; case 312: -#line 563 "char_ref.rl" {te = p+1;{ output->first = 0x2aec; {p++; goto _out; } }} break; case 313: -#line 564 "char_ref.rl" {te = p+1;{ output->first = 0x2262; {p++; goto _out; } }} break; case 314: -#line 565 "char_ref.rl" {te = p+1;{ output->first = 0x226d; {p++; goto _out; } }} break; case 315: -#line 566 "char_ref.rl" {te = p+1;{ output->first = 0x2226; {p++; goto _out; } }} break; case 316: -#line 567 "char_ref.rl" {te = p+1;{ output->first = 0x2209; {p++; goto _out; } }} break; case 317: -#line 568 "char_ref.rl" {te = p+1;{ output->first = 0x2260; {p++; goto _out; } }} break; case 318: -#line 569 "char_ref.rl" {te = p+1;{ output->first = 0x2242; output->second = 0x0338; {p++; goto _out; } }} break; case 319: -#line 570 "char_ref.rl" {te = p+1;{ output->first = 0x2204; {p++; goto _out; } }} break; case 320: -#line 571 "char_ref.rl" {te = p+1;{ output->first = 0x226f; {p++; goto _out; } }} break; case 321: -#line 572 "char_ref.rl" {te = p+1;{ output->first = 0x2271; {p++; goto _out; } }} break; case 322: -#line 573 "char_ref.rl" {te = p+1;{ output->first = 0x2267; output->second = 0x0338; {p++; goto _out; } }} break; case 323: -#line 574 "char_ref.rl" {te = p+1;{ output->first = 0x226b; output->second = 0x0338; {p++; goto _out; } }} break; case 324: -#line 575 "char_ref.rl" {te = p+1;{ output->first = 0x2279; {p++; goto _out; } }} break; case 325: -#line 576 "char_ref.rl" {te = p+1;{ output->first = 0x2a7e; output->second = 0x0338; {p++; goto _out; } }} break; case 326: -#line 577 "char_ref.rl" {te = p+1;{ output->first = 0x2275; {p++; goto _out; } }} break; case 327: -#line 578 "char_ref.rl" {te = 
p+1;{ output->first = 0x224e; output->second = 0x0338; {p++; goto _out; } }} break; case 328: -#line 579 "char_ref.rl" {te = p+1;{ output->first = 0x224f; output->second = 0x0338; {p++; goto _out; } }} break; case 329: -#line 580 "char_ref.rl" {te = p+1;{ output->first = 0x22ea; {p++; goto _out; } }} break; case 330: -#line 581 "char_ref.rl" {te = p+1;{ output->first = 0x29cf; output->second = 0x0338; {p++; goto _out; } }} break; case 331: -#line 582 "char_ref.rl" {te = p+1;{ output->first = 0x22ec; {p++; goto _out; } }} break; case 332: -#line 583 "char_ref.rl" {te = p+1;{ output->first = 0x226e; {p++; goto _out; } }} break; case 333: -#line 584 "char_ref.rl" {te = p+1;{ output->first = 0x2270; {p++; goto _out; } }} break; case 334: -#line 585 "char_ref.rl" {te = p+1;{ output->first = 0x2278; {p++; goto _out; } }} break; case 335: -#line 586 "char_ref.rl" {te = p+1;{ output->first = 0x226a; output->second = 0x0338; {p++; goto _out; } }} break; case 336: -#line 587 "char_ref.rl" {te = p+1;{ output->first = 0x2a7d; output->second = 0x0338; {p++; goto _out; } }} break; case 337: -#line 588 "char_ref.rl" {te = p+1;{ output->first = 0x2274; {p++; goto _out; } }} break; case 338: -#line 589 "char_ref.rl" {te = p+1;{ output->first = 0x2aa2; output->second = 0x0338; {p++; goto _out; } }} break; case 339: -#line 590 "char_ref.rl" {te = p+1;{ output->first = 0x2aa1; output->second = 0x0338; {p++; goto _out; } }} break; case 340: -#line 591 "char_ref.rl" {te = p+1;{ output->first = 0x2280; {p++; goto _out; } }} break; case 341: -#line 592 "char_ref.rl" {te = p+1;{ output->first = 0x2aaf; output->second = 0x0338; {p++; goto _out; } }} break; case 342: -#line 593 "char_ref.rl" {te = p+1;{ output->first = 0x22e0; {p++; goto _out; } }} break; case 343: -#line 594 "char_ref.rl" {te = p+1;{ output->first = 0x220c; {p++; goto _out; } }} break; case 344: -#line 595 "char_ref.rl" {te = p+1;{ output->first = 0x22eb; {p++; goto _out; } }} break; case 345: -#line 596 "char_ref.rl" {te = 
p+1;{ output->first = 0x29d0; output->second = 0x0338; {p++; goto _out; } }} break; case 346: -#line 597 "char_ref.rl" {te = p+1;{ output->first = 0x22ed; {p++; goto _out; } }} break; case 347: -#line 598 "char_ref.rl" {te = p+1;{ output->first = 0x228f; output->second = 0x0338; {p++; goto _out; } }} break; case 348: -#line 599 "char_ref.rl" {te = p+1;{ output->first = 0x22e2; {p++; goto _out; } }} break; case 349: -#line 600 "char_ref.rl" {te = p+1;{ output->first = 0x2290; output->second = 0x0338; {p++; goto _out; } }} break; case 350: -#line 601 "char_ref.rl" {te = p+1;{ output->first = 0x22e3; {p++; goto _out; } }} break; case 351: -#line 602 "char_ref.rl" {te = p+1;{ output->first = 0x2282; output->second = 0x20d2; {p++; goto _out; } }} break; case 352: -#line 603 "char_ref.rl" {te = p+1;{ output->first = 0x2288; {p++; goto _out; } }} break; case 353: -#line 604 "char_ref.rl" {te = p+1;{ output->first = 0x2281; {p++; goto _out; } }} break; case 354: -#line 605 "char_ref.rl" {te = p+1;{ output->first = 0x2ab0; output->second = 0x0338; {p++; goto _out; } }} break; case 355: -#line 606 "char_ref.rl" {te = p+1;{ output->first = 0x22e1; {p++; goto _out; } }} break; case 356: -#line 607 "char_ref.rl" {te = p+1;{ output->first = 0x227f; output->second = 0x0338; {p++; goto _out; } }} break; case 357: -#line 608 "char_ref.rl" {te = p+1;{ output->first = 0x2283; output->second = 0x20d2; {p++; goto _out; } }} break; case 358: -#line 609 "char_ref.rl" {te = p+1;{ output->first = 0x2289; {p++; goto _out; } }} break; case 359: -#line 610 "char_ref.rl" {te = p+1;{ output->first = 0x2241; {p++; goto _out; } }} break; case 360: -#line 611 "char_ref.rl" {te = p+1;{ output->first = 0x2244; {p++; goto _out; } }} break; case 361: -#line 612 "char_ref.rl" {te = p+1;{ output->first = 0x2247; {p++; goto _out; } }} break; case 362: -#line 613 "char_ref.rl" {te = p+1;{ output->first = 0x2249; {p++; goto _out; } }} break; case 363: -#line 614 "char_ref.rl" {te = p+1;{ output->first = 
0x2224; {p++; goto _out; } }} break; case 364: -#line 615 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4a9; {p++; goto _out; } }} break; case 365: -#line 616 "char_ref.rl" {te = p+1;{ output->first = 0xd1; {p++; goto _out; } }} break; case 366: -#line 618 "char_ref.rl" {te = p+1;{ output->first = 0x039d; {p++; goto _out; } }} break; case 367: -#line 619 "char_ref.rl" {te = p+1;{ output->first = 0x0152; {p++; goto _out; } }} break; case 368: -#line 620 "char_ref.rl" {te = p+1;{ output->first = 0xd3; {p++; goto _out; } }} break; case 369: -#line 622 "char_ref.rl" {te = p+1;{ output->first = 0xd4; {p++; goto _out; } }} break; case 370: -#line 624 "char_ref.rl" {te = p+1;{ output->first = 0x041e; {p++; goto _out; } }} break; case 371: -#line 625 "char_ref.rl" {te = p+1;{ output->first = 0x0150; {p++; goto _out; } }} break; case 372: -#line 626 "char_ref.rl" {te = p+1;{ output->first = 0x0001d512; {p++; goto _out; } }} break; case 373: -#line 627 "char_ref.rl" {te = p+1;{ output->first = 0xd2; {p++; goto _out; } }} break; case 374: -#line 629 "char_ref.rl" {te = p+1;{ output->first = 0x014c; {p++; goto _out; } }} break; case 375: -#line 630 "char_ref.rl" {te = p+1;{ output->first = 0x03a9; {p++; goto _out; } }} break; case 376: -#line 631 "char_ref.rl" {te = p+1;{ output->first = 0x039f; {p++; goto _out; } }} break; case 377: -#line 632 "char_ref.rl" {te = p+1;{ output->first = 0x0001d546; {p++; goto _out; } }} break; case 378: -#line 633 "char_ref.rl" {te = p+1;{ output->first = 0x201c; {p++; goto _out; } }} break; case 379: -#line 634 "char_ref.rl" {te = p+1;{ output->first = 0x2018; {p++; goto _out; } }} break; case 380: -#line 635 "char_ref.rl" {te = p+1;{ output->first = 0x2a54; {p++; goto _out; } }} break; case 381: -#line 636 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4aa; {p++; goto _out; } }} break; case 382: -#line 637 "char_ref.rl" {te = p+1;{ output->first = 0xd8; {p++; goto _out; } }} break; case 383: -#line 639 "char_ref.rl" {te = p+1;{ 
output->first = 0xd5; {p++; goto _out; } }} break; case 384: -#line 641 "char_ref.rl" {te = p+1;{ output->first = 0x2a37; {p++; goto _out; } }} break; case 385: -#line 642 "char_ref.rl" {te = p+1;{ output->first = 0xd6; {p++; goto _out; } }} break; case 386: -#line 644 "char_ref.rl" {te = p+1;{ output->first = 0x203e; {p++; goto _out; } }} break; case 387: -#line 645 "char_ref.rl" {te = p+1;{ output->first = 0x23de; {p++; goto _out; } }} break; case 388: -#line 646 "char_ref.rl" {te = p+1;{ output->first = 0x23b4; {p++; goto _out; } }} break; case 389: -#line 647 "char_ref.rl" {te = p+1;{ output->first = 0x23dc; {p++; goto _out; } }} break; case 390: -#line 648 "char_ref.rl" {te = p+1;{ output->first = 0x2202; {p++; goto _out; } }} break; case 391: -#line 649 "char_ref.rl" {te = p+1;{ output->first = 0x041f; {p++; goto _out; } }} break; case 392: -#line 650 "char_ref.rl" {te = p+1;{ output->first = 0x0001d513; {p++; goto _out; } }} break; case 393: -#line 651 "char_ref.rl" {te = p+1;{ output->first = 0x03a6; {p++; goto _out; } }} break; case 394: -#line 652 "char_ref.rl" {te = p+1;{ output->first = 0x03a0; {p++; goto _out; } }} break; case 395: -#line 653 "char_ref.rl" {te = p+1;{ output->first = 0xb1; {p++; goto _out; } }} break; case 396: -#line 654 "char_ref.rl" {te = p+1;{ output->first = 0x210c; {p++; goto _out; } }} break; case 397: -#line 655 "char_ref.rl" {te = p+1;{ output->first = 0x2119; {p++; goto _out; } }} break; case 398: -#line 656 "char_ref.rl" {te = p+1;{ output->first = 0x2abb; {p++; goto _out; } }} break; case 399: -#line 657 "char_ref.rl" {te = p+1;{ output->first = 0x227a; {p++; goto _out; } }} break; case 400: -#line 658 "char_ref.rl" {te = p+1;{ output->first = 0x2aaf; {p++; goto _out; } }} break; case 401: -#line 659 "char_ref.rl" {te = p+1;{ output->first = 0x227c; {p++; goto _out; } }} break; case 402: -#line 660 "char_ref.rl" {te = p+1;{ output->first = 0x227e; {p++; goto _out; } }} break; case 403: -#line 661 "char_ref.rl" {te = p+1;{ 
output->first = 0x2033; {p++; goto _out; } }} break; case 404: -#line 662 "char_ref.rl" {te = p+1;{ output->first = 0x220f; {p++; goto _out; } }} break; case 405: -#line 663 "char_ref.rl" {te = p+1;{ output->first = 0x2237; {p++; goto _out; } }} break; case 406: -#line 664 "char_ref.rl" {te = p+1;{ output->first = 0x221d; {p++; goto _out; } }} break; case 407: -#line 665 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4ab; {p++; goto _out; } }} break; case 408: -#line 666 "char_ref.rl" {te = p+1;{ output->first = 0x03a8; {p++; goto _out; } }} break; case 409: -#line 667 "char_ref.rl" {te = p+1;{ output->first = 0x22; {p++; goto _out; } }} break; case 410: -#line 669 "char_ref.rl" {te = p+1;{ output->first = 0x0001d514; {p++; goto _out; } }} break; case 411: -#line 670 "char_ref.rl" {te = p+1;{ output->first = 0x211a; {p++; goto _out; } }} break; case 412: -#line 671 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4ac; {p++; goto _out; } }} break; case 413: -#line 672 "char_ref.rl" {te = p+1;{ output->first = 0x2910; {p++; goto _out; } }} break; case 414: -#line 673 "char_ref.rl" {te = p+1;{ output->first = 0xae; {p++; goto _out; } }} break; case 415: -#line 675 "char_ref.rl" {te = p+1;{ output->first = 0x0154; {p++; goto _out; } }} break; case 416: -#line 676 "char_ref.rl" {te = p+1;{ output->first = 0x27eb; {p++; goto _out; } }} break; case 417: -#line 677 "char_ref.rl" {te = p+1;{ output->first = 0x21a0; {p++; goto _out; } }} break; case 418: -#line 678 "char_ref.rl" {te = p+1;{ output->first = 0x2916; {p++; goto _out; } }} break; case 419: -#line 679 "char_ref.rl" {te = p+1;{ output->first = 0x0158; {p++; goto _out; } }} break; case 420: -#line 680 "char_ref.rl" {te = p+1;{ output->first = 0x0156; {p++; goto _out; } }} break; case 421: -#line 681 "char_ref.rl" {te = p+1;{ output->first = 0x0420; {p++; goto _out; } }} break; case 422: -#line 682 "char_ref.rl" {te = p+1;{ output->first = 0x211c; {p++; goto _out; } }} break; case 423: -#line 683 "char_ref.rl" {te 
= p+1;{ output->first = 0x220b; {p++; goto _out; } }} break; case 424: -#line 684 "char_ref.rl" {te = p+1;{ output->first = 0x21cb; {p++; goto _out; } }} break; case 425: -#line 685 "char_ref.rl" {te = p+1;{ output->first = 0x296f; {p++; goto _out; } }} break; case 426: -#line 686 "char_ref.rl" {te = p+1;{ output->first = 0x211c; {p++; goto _out; } }} break; case 427: -#line 687 "char_ref.rl" {te = p+1;{ output->first = 0x03a1; {p++; goto _out; } }} break; case 428: -#line 688 "char_ref.rl" {te = p+1;{ output->first = 0x27e9; {p++; goto _out; } }} break; case 429: -#line 689 "char_ref.rl" {te = p+1;{ output->first = 0x2192; {p++; goto _out; } }} break; case 430: -#line 690 "char_ref.rl" {te = p+1;{ output->first = 0x21e5; {p++; goto _out; } }} break; case 431: -#line 691 "char_ref.rl" {te = p+1;{ output->first = 0x21c4; {p++; goto _out; } }} break; case 432: -#line 692 "char_ref.rl" {te = p+1;{ output->first = 0x2309; {p++; goto _out; } }} break; case 433: -#line 693 "char_ref.rl" {te = p+1;{ output->first = 0x27e7; {p++; goto _out; } }} break; case 434: -#line 694 "char_ref.rl" {te = p+1;{ output->first = 0x295d; {p++; goto _out; } }} break; case 435: -#line 695 "char_ref.rl" {te = p+1;{ output->first = 0x21c2; {p++; goto _out; } }} break; case 436: -#line 696 "char_ref.rl" {te = p+1;{ output->first = 0x2955; {p++; goto _out; } }} break; case 437: -#line 697 "char_ref.rl" {te = p+1;{ output->first = 0x230b; {p++; goto _out; } }} break; case 438: -#line 698 "char_ref.rl" {te = p+1;{ output->first = 0x22a2; {p++; goto _out; } }} break; case 439: -#line 699 "char_ref.rl" {te = p+1;{ output->first = 0x21a6; {p++; goto _out; } }} break; case 440: -#line 700 "char_ref.rl" {te = p+1;{ output->first = 0x295b; {p++; goto _out; } }} break; case 441: -#line 701 "char_ref.rl" {te = p+1;{ output->first = 0x22b3; {p++; goto _out; } }} break; case 442: -#line 702 "char_ref.rl" {te = p+1;{ output->first = 0x29d0; {p++; goto _out; } }} break; case 443: -#line 703 "char_ref.rl" {te 
= p+1;{ output->first = 0x22b5; {p++; goto _out; } }} break; case 444: -#line 704 "char_ref.rl" {te = p+1;{ output->first = 0x294f; {p++; goto _out; } }} break; case 445: -#line 705 "char_ref.rl" {te = p+1;{ output->first = 0x295c; {p++; goto _out; } }} break; case 446: -#line 706 "char_ref.rl" {te = p+1;{ output->first = 0x21be; {p++; goto _out; } }} break; case 447: -#line 707 "char_ref.rl" {te = p+1;{ output->first = 0x2954; {p++; goto _out; } }} break; case 448: -#line 708 "char_ref.rl" {te = p+1;{ output->first = 0x21c0; {p++; goto _out; } }} break; case 449: -#line 709 "char_ref.rl" {te = p+1;{ output->first = 0x2953; {p++; goto _out; } }} break; case 450: -#line 710 "char_ref.rl" {te = p+1;{ output->first = 0x21d2; {p++; goto _out; } }} break; case 451: -#line 711 "char_ref.rl" {te = p+1;{ output->first = 0x211d; {p++; goto _out; } }} break; case 452: -#line 712 "char_ref.rl" {te = p+1;{ output->first = 0x2970; {p++; goto _out; } }} break; case 453: -#line 713 "char_ref.rl" {te = p+1;{ output->first = 0x21db; {p++; goto _out; } }} break; case 454: -#line 714 "char_ref.rl" {te = p+1;{ output->first = 0x211b; {p++; goto _out; } }} break; case 455: -#line 715 "char_ref.rl" {te = p+1;{ output->first = 0x21b1; {p++; goto _out; } }} break; case 456: -#line 716 "char_ref.rl" {te = p+1;{ output->first = 0x29f4; {p++; goto _out; } }} break; case 457: -#line 717 "char_ref.rl" {te = p+1;{ output->first = 0x0429; {p++; goto _out; } }} break; case 458: -#line 718 "char_ref.rl" {te = p+1;{ output->first = 0x0428; {p++; goto _out; } }} break; case 459: -#line 719 "char_ref.rl" {te = p+1;{ output->first = 0x042c; {p++; goto _out; } }} break; case 460: -#line 720 "char_ref.rl" {te = p+1;{ output->first = 0x015a; {p++; goto _out; } }} break; case 461: -#line 721 "char_ref.rl" {te = p+1;{ output->first = 0x2abc; {p++; goto _out; } }} break; case 462: -#line 722 "char_ref.rl" {te = p+1;{ output->first = 0x0160; {p++; goto _out; } }} break; case 463: -#line 723 "char_ref.rl" {te 
= p+1;{ output->first = 0x015e; {p++; goto _out; } }} break; case 464: -#line 724 "char_ref.rl" {te = p+1;{ output->first = 0x015c; {p++; goto _out; } }} break; case 465: -#line 725 "char_ref.rl" {te = p+1;{ output->first = 0x0421; {p++; goto _out; } }} break; case 466: -#line 726 "char_ref.rl" {te = p+1;{ output->first = 0x0001d516; {p++; goto _out; } }} break; case 467: -#line 727 "char_ref.rl" {te = p+1;{ output->first = 0x2193; {p++; goto _out; } }} break; case 468: -#line 728 "char_ref.rl" {te = p+1;{ output->first = 0x2190; {p++; goto _out; } }} break; case 469: -#line 729 "char_ref.rl" {te = p+1;{ output->first = 0x2192; {p++; goto _out; } }} break; case 470: -#line 730 "char_ref.rl" {te = p+1;{ output->first = 0x2191; {p++; goto _out; } }} break; case 471: -#line 731 "char_ref.rl" {te = p+1;{ output->first = 0x03a3; {p++; goto _out; } }} break; case 472: -#line 732 "char_ref.rl" {te = p+1;{ output->first = 0x2218; {p++; goto _out; } }} break; case 473: -#line 733 "char_ref.rl" {te = p+1;{ output->first = 0x0001d54a; {p++; goto _out; } }} break; case 474: -#line 734 "char_ref.rl" {te = p+1;{ output->first = 0x221a; {p++; goto _out; } }} break; case 475: -#line 735 "char_ref.rl" {te = p+1;{ output->first = 0x25a1; {p++; goto _out; } }} break; case 476: -#line 736 "char_ref.rl" {te = p+1;{ output->first = 0x2293; {p++; goto _out; } }} break; case 477: -#line 737 "char_ref.rl" {te = p+1;{ output->first = 0x228f; {p++; goto _out; } }} break; case 478: -#line 738 "char_ref.rl" {te = p+1;{ output->first = 0x2291; {p++; goto _out; } }} break; case 479: -#line 739 "char_ref.rl" {te = p+1;{ output->first = 0x2290; {p++; goto _out; } }} break; case 480: -#line 740 "char_ref.rl" {te = p+1;{ output->first = 0x2292; {p++; goto _out; } }} break; case 481: -#line 741 "char_ref.rl" {te = p+1;{ output->first = 0x2294; {p++; goto _out; } }} break; case 482: -#line 742 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4ae; {p++; goto _out; } }} break; case 483: -#line 743 
"char_ref.rl" {te = p+1;{ output->first = 0x22c6; {p++; goto _out; } }} break; case 484: -#line 744 "char_ref.rl" {te = p+1;{ output->first = 0x22d0; {p++; goto _out; } }} break; case 485: -#line 745 "char_ref.rl" {te = p+1;{ output->first = 0x22d0; {p++; goto _out; } }} break; case 486: -#line 746 "char_ref.rl" {te = p+1;{ output->first = 0x2286; {p++; goto _out; } }} break; case 487: -#line 747 "char_ref.rl" {te = p+1;{ output->first = 0x227b; {p++; goto _out; } }} break; case 488: -#line 748 "char_ref.rl" {te = p+1;{ output->first = 0x2ab0; {p++; goto _out; } }} break; case 489: -#line 749 "char_ref.rl" {te = p+1;{ output->first = 0x227d; {p++; goto _out; } }} break; case 490: -#line 750 "char_ref.rl" {te = p+1;{ output->first = 0x227f; {p++; goto _out; } }} break; case 491: -#line 751 "char_ref.rl" {te = p+1;{ output->first = 0x220b; {p++; goto _out; } }} break; case 492: -#line 752 "char_ref.rl" {te = p+1;{ output->first = 0x2211; {p++; goto _out; } }} break; case 493: -#line 753 "char_ref.rl" {te = p+1;{ output->first = 0x22d1; {p++; goto _out; } }} break; case 494: -#line 754 "char_ref.rl" {te = p+1;{ output->first = 0x2283; {p++; goto _out; } }} break; case 495: -#line 755 "char_ref.rl" {te = p+1;{ output->first = 0x2287; {p++; goto _out; } }} break; case 496: -#line 756 "char_ref.rl" {te = p+1;{ output->first = 0x22d1; {p++; goto _out; } }} break; case 497: -#line 757 "char_ref.rl" {te = p+1;{ output->first = 0xde; {p++; goto _out; } }} break; case 498: -#line 759 "char_ref.rl" {te = p+1;{ output->first = 0x2122; {p++; goto _out; } }} break; case 499: -#line 760 "char_ref.rl" {te = p+1;{ output->first = 0x040b; {p++; goto _out; } }} break; case 500: -#line 761 "char_ref.rl" {te = p+1;{ output->first = 0x0426; {p++; goto _out; } }} break; case 501: -#line 762 "char_ref.rl" {te = p+1;{ output->first = 0x09; {p++; goto _out; } }} break; case 502: -#line 763 "char_ref.rl" {te = p+1;{ output->first = 0x03a4; {p++; goto _out; } }} break; case 503: -#line 764 
"char_ref.rl" {te = p+1;{ output->first = 0x0164; {p++; goto _out; } }} break; case 504: -#line 765 "char_ref.rl" {te = p+1;{ output->first = 0x0162; {p++; goto _out; } }} break; case 505: -#line 766 "char_ref.rl" {te = p+1;{ output->first = 0x0422; {p++; goto _out; } }} break; case 506: -#line 767 "char_ref.rl" {te = p+1;{ output->first = 0x0001d517; {p++; goto _out; } }} break; case 507: -#line 768 "char_ref.rl" {te = p+1;{ output->first = 0x2234; {p++; goto _out; } }} break; case 508: -#line 769 "char_ref.rl" {te = p+1;{ output->first = 0x0398; {p++; goto _out; } }} break; case 509: -#line 770 "char_ref.rl" {te = p+1;{ output->first = 0x205f; output->second = 0x200a; {p++; goto _out; } }} break; case 510: -#line 771 "char_ref.rl" {te = p+1;{ output->first = 0x2009; {p++; goto _out; } }} break; case 511: -#line 772 "char_ref.rl" {te = p+1;{ output->first = 0x223c; {p++; goto _out; } }} break; case 512: -#line 773 "char_ref.rl" {te = p+1;{ output->first = 0x2243; {p++; goto _out; } }} break; case 513: -#line 774 "char_ref.rl" {te = p+1;{ output->first = 0x2245; {p++; goto _out; } }} break; case 514: -#line 775 "char_ref.rl" {te = p+1;{ output->first = 0x2248; {p++; goto _out; } }} break; case 515: -#line 776 "char_ref.rl" {te = p+1;{ output->first = 0x0001d54b; {p++; goto _out; } }} break; case 516: -#line 777 "char_ref.rl" {te = p+1;{ output->first = 0x20db; {p++; goto _out; } }} break; case 517: -#line 778 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4af; {p++; goto _out; } }} break; case 518: -#line 779 "char_ref.rl" {te = p+1;{ output->first = 0x0166; {p++; goto _out; } }} break; case 519: -#line 780 "char_ref.rl" {te = p+1;{ output->first = 0xda; {p++; goto _out; } }} break; case 520: -#line 782 "char_ref.rl" {te = p+1;{ output->first = 0x219f; {p++; goto _out; } }} break; case 521: -#line 783 "char_ref.rl" {te = p+1;{ output->first = 0x2949; {p++; goto _out; } }} break; case 522: -#line 784 "char_ref.rl" {te = p+1;{ output->first = 0x040e; {p++; goto 
_out; } }} break; case 523: -#line 785 "char_ref.rl" {te = p+1;{ output->first = 0x016c; {p++; goto _out; } }} break; case 524: -#line 786 "char_ref.rl" {te = p+1;{ output->first = 0xdb; {p++; goto _out; } }} break; case 525: -#line 788 "char_ref.rl" {te = p+1;{ output->first = 0x0423; {p++; goto _out; } }} break; case 526: -#line 789 "char_ref.rl" {te = p+1;{ output->first = 0x0170; {p++; goto _out; } }} break; case 527: -#line 790 "char_ref.rl" {te = p+1;{ output->first = 0x0001d518; {p++; goto _out; } }} break; case 528: -#line 791 "char_ref.rl" {te = p+1;{ output->first = 0xd9; {p++; goto _out; } }} break; case 529: -#line 793 "char_ref.rl" {te = p+1;{ output->first = 0x016a; {p++; goto _out; } }} break; case 530: -#line 794 "char_ref.rl" {te = p+1;{ output->first = 0x5f; {p++; goto _out; } }} break; case 531: -#line 795 "char_ref.rl" {te = p+1;{ output->first = 0x23df; {p++; goto _out; } }} break; case 532: -#line 796 "char_ref.rl" {te = p+1;{ output->first = 0x23b5; {p++; goto _out; } }} break; case 533: -#line 797 "char_ref.rl" {te = p+1;{ output->first = 0x23dd; {p++; goto _out; } }} break; case 534: -#line 798 "char_ref.rl" {te = p+1;{ output->first = 0x22c3; {p++; goto _out; } }} break; case 535: -#line 799 "char_ref.rl" {te = p+1;{ output->first = 0x228e; {p++; goto _out; } }} break; case 536: -#line 800 "char_ref.rl" {te = p+1;{ output->first = 0x0172; {p++; goto _out; } }} break; case 537: -#line 801 "char_ref.rl" {te = p+1;{ output->first = 0x0001d54c; {p++; goto _out; } }} break; case 538: -#line 802 "char_ref.rl" {te = p+1;{ output->first = 0x2191; {p++; goto _out; } }} break; case 539: -#line 803 "char_ref.rl" {te = p+1;{ output->first = 0x2912; {p++; goto _out; } }} break; case 540: -#line 804 "char_ref.rl" {te = p+1;{ output->first = 0x21c5; {p++; goto _out; } }} break; case 541: -#line 805 "char_ref.rl" {te = p+1;{ output->first = 0x2195; {p++; goto _out; } }} break; case 542: -#line 806 "char_ref.rl" {te = p+1;{ output->first = 0x296e; {p++; 
goto _out; } }} break; case 543: -#line 807 "char_ref.rl" {te = p+1;{ output->first = 0x22a5; {p++; goto _out; } }} break; case 544: -#line 808 "char_ref.rl" {te = p+1;{ output->first = 0x21a5; {p++; goto _out; } }} break; case 545: -#line 809 "char_ref.rl" {te = p+1;{ output->first = 0x21d1; {p++; goto _out; } }} break; case 546: -#line 810 "char_ref.rl" {te = p+1;{ output->first = 0x21d5; {p++; goto _out; } }} break; case 547: -#line 811 "char_ref.rl" {te = p+1;{ output->first = 0x2196; {p++; goto _out; } }} break; case 548: -#line 812 "char_ref.rl" {te = p+1;{ output->first = 0x2197; {p++; goto _out; } }} break; case 549: -#line 813 "char_ref.rl" {te = p+1;{ output->first = 0x03d2; {p++; goto _out; } }} break; case 550: -#line 814 "char_ref.rl" {te = p+1;{ output->first = 0x03a5; {p++; goto _out; } }} break; case 551: -#line 815 "char_ref.rl" {te = p+1;{ output->first = 0x016e; {p++; goto _out; } }} break; case 552: -#line 816 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b0; {p++; goto _out; } }} break; case 553: -#line 817 "char_ref.rl" {te = p+1;{ output->first = 0x0168; {p++; goto _out; } }} break; case 554: -#line 818 "char_ref.rl" {te = p+1;{ output->first = 0xdc; {p++; goto _out; } }} break; case 555: -#line 820 "char_ref.rl" {te = p+1;{ output->first = 0x22ab; {p++; goto _out; } }} break; case 556: -#line 821 "char_ref.rl" {te = p+1;{ output->first = 0x2aeb; {p++; goto _out; } }} break; case 557: -#line 822 "char_ref.rl" {te = p+1;{ output->first = 0x0412; {p++; goto _out; } }} break; case 558: -#line 823 "char_ref.rl" {te = p+1;{ output->first = 0x22a9; {p++; goto _out; } }} break; case 559: -#line 824 "char_ref.rl" {te = p+1;{ output->first = 0x2ae6; {p++; goto _out; } }} break; case 560: -#line 825 "char_ref.rl" {te = p+1;{ output->first = 0x22c1; {p++; goto _out; } }} break; case 561: -#line 826 "char_ref.rl" {te = p+1;{ output->first = 0x2016; {p++; goto _out; } }} break; case 562: -#line 827 "char_ref.rl" {te = p+1;{ output->first = 0x2016; 
{p++; goto _out; } }} break; case 563: -#line 828 "char_ref.rl" {te = p+1;{ output->first = 0x2223; {p++; goto _out; } }} break; case 564: -#line 829 "char_ref.rl" {te = p+1;{ output->first = 0x7c; {p++; goto _out; } }} break; case 565: -#line 830 "char_ref.rl" {te = p+1;{ output->first = 0x2758; {p++; goto _out; } }} break; case 566: -#line 831 "char_ref.rl" {te = p+1;{ output->first = 0x2240; {p++; goto _out; } }} break; case 567: -#line 832 "char_ref.rl" {te = p+1;{ output->first = 0x200a; {p++; goto _out; } }} break; case 568: -#line 833 "char_ref.rl" {te = p+1;{ output->first = 0x0001d519; {p++; goto _out; } }} break; case 569: -#line 834 "char_ref.rl" {te = p+1;{ output->first = 0x0001d54d; {p++; goto _out; } }} break; case 570: -#line 835 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b1; {p++; goto _out; } }} break; case 571: -#line 836 "char_ref.rl" {te = p+1;{ output->first = 0x22aa; {p++; goto _out; } }} break; case 572: -#line 837 "char_ref.rl" {te = p+1;{ output->first = 0x0174; {p++; goto _out; } }} break; case 573: -#line 838 "char_ref.rl" {te = p+1;{ output->first = 0x22c0; {p++; goto _out; } }} break; case 574: -#line 839 "char_ref.rl" {te = p+1;{ output->first = 0x0001d51a; {p++; goto _out; } }} break; case 575: -#line 840 "char_ref.rl" {te = p+1;{ output->first = 0x0001d54e; {p++; goto _out; } }} break; case 576: -#line 841 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b2; {p++; goto _out; } }} break; case 577: -#line 842 "char_ref.rl" {te = p+1;{ output->first = 0x0001d51b; {p++; goto _out; } }} break; case 578: -#line 843 "char_ref.rl" {te = p+1;{ output->first = 0x039e; {p++; goto _out; } }} break; case 579: -#line 844 "char_ref.rl" {te = p+1;{ output->first = 0x0001d54f; {p++; goto _out; } }} break; case 580: -#line 845 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b3; {p++; goto _out; } }} break; case 581: -#line 846 "char_ref.rl" {te = p+1;{ output->first = 0x042f; {p++; goto _out; } }} break; case 582: -#line 847 "char_ref.rl" 
{te = p+1;{ output->first = 0x0407; {p++; goto _out; } }} break; case 583: -#line 848 "char_ref.rl" {te = p+1;{ output->first = 0x042e; {p++; goto _out; } }} break; case 584: -#line 849 "char_ref.rl" {te = p+1;{ output->first = 0xdd; {p++; goto _out; } }} break; case 585: -#line 851 "char_ref.rl" {te = p+1;{ output->first = 0x0176; {p++; goto _out; } }} break; case 586: -#line 852 "char_ref.rl" {te = p+1;{ output->first = 0x042b; {p++; goto _out; } }} break; case 587: -#line 853 "char_ref.rl" {te = p+1;{ output->first = 0x0001d51c; {p++; goto _out; } }} break; case 588: -#line 854 "char_ref.rl" {te = p+1;{ output->first = 0x0001d550; {p++; goto _out; } }} break; case 589: -#line 855 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b4; {p++; goto _out; } }} break; case 590: -#line 856 "char_ref.rl" {te = p+1;{ output->first = 0x0178; {p++; goto _out; } }} break; case 591: -#line 857 "char_ref.rl" {te = p+1;{ output->first = 0x0416; {p++; goto _out; } }} break; case 592: -#line 858 "char_ref.rl" {te = p+1;{ output->first = 0x0179; {p++; goto _out; } }} break; case 593: -#line 859 "char_ref.rl" {te = p+1;{ output->first = 0x017d; {p++; goto _out; } }} break; case 594: -#line 860 "char_ref.rl" {te = p+1;{ output->first = 0x0417; {p++; goto _out; } }} break; case 595: -#line 861 "char_ref.rl" {te = p+1;{ output->first = 0x017b; {p++; goto _out; } }} break; case 596: -#line 862 "char_ref.rl" {te = p+1;{ output->first = 0x200b; {p++; goto _out; } }} break; case 597: -#line 863 "char_ref.rl" {te = p+1;{ output->first = 0x0396; {p++; goto _out; } }} break; case 598: -#line 864 "char_ref.rl" {te = p+1;{ output->first = 0x2128; {p++; goto _out; } }} break; case 599: -#line 865 "char_ref.rl" {te = p+1;{ output->first = 0x2124; {p++; goto _out; } }} break; case 600: -#line 866 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b5; {p++; goto _out; } }} break; case 601: -#line 867 "char_ref.rl" {te = p+1;{ output->first = 0xe1; {p++; goto _out; } }} break; case 602: -#line 869 
"char_ref.rl" {te = p+1;{ output->first = 0x0103; {p++; goto _out; } }} break; case 603: -#line 870 "char_ref.rl" {te = p+1;{ output->first = 0x223e; {p++; goto _out; } }} break; case 604: -#line 871 "char_ref.rl" {te = p+1;{ output->first = 0x223e; output->second = 0x0333; {p++; goto _out; } }} break; case 605: -#line 872 "char_ref.rl" {te = p+1;{ output->first = 0x223f; {p++; goto _out; } }} break; case 606: -#line 873 "char_ref.rl" {te = p+1;{ output->first = 0xe2; {p++; goto _out; } }} break; case 607: -#line 875 "char_ref.rl" {te = p+1;{ output->first = 0xb4; {p++; goto _out; } }} break; case 608: -#line 877 "char_ref.rl" {te = p+1;{ output->first = 0x0430; {p++; goto _out; } }} break; case 609: -#line 878 "char_ref.rl" {te = p+1;{ output->first = 0xe6; {p++; goto _out; } }} break; case 610: -#line 880 "char_ref.rl" {te = p+1;{ output->first = 0x2061; {p++; goto _out; } }} break; case 611: -#line 881 "char_ref.rl" {te = p+1;{ output->first = 0x0001d51e; {p++; goto _out; } }} break; case 612: -#line 882 "char_ref.rl" {te = p+1;{ output->first = 0xe0; {p++; goto _out; } }} break; case 613: -#line 884 "char_ref.rl" {te = p+1;{ output->first = 0x2135; {p++; goto _out; } }} break; case 614: -#line 885 "char_ref.rl" {te = p+1;{ output->first = 0x2135; {p++; goto _out; } }} break; case 615: -#line 886 "char_ref.rl" {te = p+1;{ output->first = 0x03b1; {p++; goto _out; } }} break; case 616: -#line 887 "char_ref.rl" {te = p+1;{ output->first = 0x0101; {p++; goto _out; } }} break; case 617: -#line 888 "char_ref.rl" {te = p+1;{ output->first = 0x2a3f; {p++; goto _out; } }} break; case 618: -#line 889 "char_ref.rl" {te = p+1;{ output->first = 0x26; {p++; goto _out; } }} break; case 619: -#line 891 "char_ref.rl" {te = p+1;{ output->first = 0x2227; {p++; goto _out; } }} break; case 620: -#line 892 "char_ref.rl" {te = p+1;{ output->first = 0x2a55; {p++; goto _out; } }} break; case 621: -#line 893 "char_ref.rl" {te = p+1;{ output->first = 0x2a5c; {p++; goto _out; } }} break; 
case 622: -#line 894 "char_ref.rl" {te = p+1;{ output->first = 0x2a58; {p++; goto _out; } }} break; case 623: -#line 895 "char_ref.rl" {te = p+1;{ output->first = 0x2a5a; {p++; goto _out; } }} break; case 624: -#line 896 "char_ref.rl" {te = p+1;{ output->first = 0x2220; {p++; goto _out; } }} break; case 625: -#line 897 "char_ref.rl" {te = p+1;{ output->first = 0x29a4; {p++; goto _out; } }} break; case 626: -#line 898 "char_ref.rl" {te = p+1;{ output->first = 0x2220; {p++; goto _out; } }} break; case 627: -#line 899 "char_ref.rl" {te = p+1;{ output->first = 0x2221; {p++; goto _out; } }} break; case 628: -#line 900 "char_ref.rl" {te = p+1;{ output->first = 0x29a8; {p++; goto _out; } }} break; case 629: -#line 901 "char_ref.rl" {te = p+1;{ output->first = 0x29a9; {p++; goto _out; } }} break; case 630: -#line 902 "char_ref.rl" {te = p+1;{ output->first = 0x29aa; {p++; goto _out; } }} break; case 631: -#line 903 "char_ref.rl" {te = p+1;{ output->first = 0x29ab; {p++; goto _out; } }} break; case 632: -#line 904 "char_ref.rl" {te = p+1;{ output->first = 0x29ac; {p++; goto _out; } }} break; case 633: -#line 905 "char_ref.rl" {te = p+1;{ output->first = 0x29ad; {p++; goto _out; } }} break; case 634: -#line 906 "char_ref.rl" {te = p+1;{ output->first = 0x29ae; {p++; goto _out; } }} break; case 635: -#line 907 "char_ref.rl" {te = p+1;{ output->first = 0x29af; {p++; goto _out; } }} break; case 636: -#line 908 "char_ref.rl" {te = p+1;{ output->first = 0x221f; {p++; goto _out; } }} break; case 637: -#line 909 "char_ref.rl" {te = p+1;{ output->first = 0x22be; {p++; goto _out; } }} break; case 638: -#line 910 "char_ref.rl" {te = p+1;{ output->first = 0x299d; {p++; goto _out; } }} break; case 639: -#line 911 "char_ref.rl" {te = p+1;{ output->first = 0x2222; {p++; goto _out; } }} break; case 640: -#line 912 "char_ref.rl" {te = p+1;{ output->first = 0xc5; {p++; goto _out; } }} break; case 641: -#line 913 "char_ref.rl" {te = p+1;{ output->first = 0x237c; {p++; goto _out; } }} break; 
case 642: -#line 914 "char_ref.rl" {te = p+1;{ output->first = 0x0105; {p++; goto _out; } }} break; case 643: -#line 915 "char_ref.rl" {te = p+1;{ output->first = 0x0001d552; {p++; goto _out; } }} break; case 644: -#line 916 "char_ref.rl" {te = p+1;{ output->first = 0x2248; {p++; goto _out; } }} break; case 645: -#line 917 "char_ref.rl" {te = p+1;{ output->first = 0x2a70; {p++; goto _out; } }} break; case 646: -#line 918 "char_ref.rl" {te = p+1;{ output->first = 0x2a6f; {p++; goto _out; } }} break; case 647: -#line 919 "char_ref.rl" {te = p+1;{ output->first = 0x224a; {p++; goto _out; } }} break; case 648: -#line 920 "char_ref.rl" {te = p+1;{ output->first = 0x224b; {p++; goto _out; } }} break; case 649: -#line 921 "char_ref.rl" {te = p+1;{ output->first = 0x27; {p++; goto _out; } }} break; case 650: -#line 922 "char_ref.rl" {te = p+1;{ output->first = 0x2248; {p++; goto _out; } }} break; case 651: -#line 923 "char_ref.rl" {te = p+1;{ output->first = 0x224a; {p++; goto _out; } }} break; case 652: -#line 924 "char_ref.rl" {te = p+1;{ output->first = 0xe5; {p++; goto _out; } }} break; case 653: -#line 926 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b6; {p++; goto _out; } }} break; case 654: -#line 927 "char_ref.rl" {te = p+1;{ output->first = 0x2a; {p++; goto _out; } }} break; case 655: -#line 928 "char_ref.rl" {te = p+1;{ output->first = 0x2248; {p++; goto _out; } }} break; case 656: -#line 929 "char_ref.rl" {te = p+1;{ output->first = 0x224d; {p++; goto _out; } }} break; case 657: -#line 930 "char_ref.rl" {te = p+1;{ output->first = 0xe3; {p++; goto _out; } }} break; case 658: -#line 932 "char_ref.rl" {te = p+1;{ output->first = 0xe4; {p++; goto _out; } }} break; case 659: -#line 934 "char_ref.rl" {te = p+1;{ output->first = 0x2233; {p++; goto _out; } }} break; case 660: -#line 935 "char_ref.rl" {te = p+1;{ output->first = 0x2a11; {p++; goto _out; } }} break; case 661: -#line 936 "char_ref.rl" {te = p+1;{ output->first = 0x2aed; {p++; goto _out; } }} break; 
case 662: -#line 937 "char_ref.rl" {te = p+1;{ output->first = 0x224c; {p++; goto _out; } }} break; case 663: -#line 938 "char_ref.rl" {te = p+1;{ output->first = 0x03f6; {p++; goto _out; } }} break; case 664: -#line 939 "char_ref.rl" {te = p+1;{ output->first = 0x2035; {p++; goto _out; } }} break; case 665: -#line 940 "char_ref.rl" {te = p+1;{ output->first = 0x223d; {p++; goto _out; } }} break; case 666: -#line 941 "char_ref.rl" {te = p+1;{ output->first = 0x22cd; {p++; goto _out; } }} break; case 667: -#line 942 "char_ref.rl" {te = p+1;{ output->first = 0x22bd; {p++; goto _out; } }} break; case 668: -#line 943 "char_ref.rl" {te = p+1;{ output->first = 0x2305; {p++; goto _out; } }} break; case 669: -#line 944 "char_ref.rl" {te = p+1;{ output->first = 0x2305; {p++; goto _out; } }} break; case 670: -#line 945 "char_ref.rl" {te = p+1;{ output->first = 0x23b5; {p++; goto _out; } }} break; case 671: -#line 946 "char_ref.rl" {te = p+1;{ output->first = 0x23b6; {p++; goto _out; } }} break; case 672: -#line 947 "char_ref.rl" {te = p+1;{ output->first = 0x224c; {p++; goto _out; } }} break; case 673: -#line 948 "char_ref.rl" {te = p+1;{ output->first = 0x0431; {p++; goto _out; } }} break; case 674: -#line 949 "char_ref.rl" {te = p+1;{ output->first = 0x201e; {p++; goto _out; } }} break; case 675: -#line 950 "char_ref.rl" {te = p+1;{ output->first = 0x2235; {p++; goto _out; } }} break; case 676: -#line 951 "char_ref.rl" {te = p+1;{ output->first = 0x2235; {p++; goto _out; } }} break; case 677: -#line 952 "char_ref.rl" {te = p+1;{ output->first = 0x29b0; {p++; goto _out; } }} break; case 678: -#line 953 "char_ref.rl" {te = p+1;{ output->first = 0x03f6; {p++; goto _out; } }} break; case 679: -#line 954 "char_ref.rl" {te = p+1;{ output->first = 0x212c; {p++; goto _out; } }} break; case 680: -#line 955 "char_ref.rl" {te = p+1;{ output->first = 0x03b2; {p++; goto _out; } }} break; case 681: -#line 956 "char_ref.rl" {te = p+1;{ output->first = 0x2136; {p++; goto _out; } }} break; 
case 682: -#line 957 "char_ref.rl" {te = p+1;{ output->first = 0x226c; {p++; goto _out; } }} break; case 683: -#line 958 "char_ref.rl" {te = p+1;{ output->first = 0x0001d51f; {p++; goto _out; } }} break; case 684: -#line 959 "char_ref.rl" {te = p+1;{ output->first = 0x22c2; {p++; goto _out; } }} break; case 685: -#line 960 "char_ref.rl" {te = p+1;{ output->first = 0x25ef; {p++; goto _out; } }} break; case 686: -#line 961 "char_ref.rl" {te = p+1;{ output->first = 0x22c3; {p++; goto _out; } }} break; case 687: -#line 962 "char_ref.rl" {te = p+1;{ output->first = 0x2a00; {p++; goto _out; } }} break; case 688: -#line 963 "char_ref.rl" {te = p+1;{ output->first = 0x2a01; {p++; goto _out; } }} break; case 689: -#line 964 "char_ref.rl" {te = p+1;{ output->first = 0x2a02; {p++; goto _out; } }} break; case 690: -#line 965 "char_ref.rl" {te = p+1;{ output->first = 0x2a06; {p++; goto _out; } }} break; case 691: -#line 966 "char_ref.rl" {te = p+1;{ output->first = 0x2605; {p++; goto _out; } }} break; case 692: -#line 967 "char_ref.rl" {te = p+1;{ output->first = 0x25bd; {p++; goto _out; } }} break; case 693: -#line 968 "char_ref.rl" {te = p+1;{ output->first = 0x25b3; {p++; goto _out; } }} break; case 694: -#line 969 "char_ref.rl" {te = p+1;{ output->first = 0x2a04; {p++; goto _out; } }} break; case 695: -#line 970 "char_ref.rl" {te = p+1;{ output->first = 0x22c1; {p++; goto _out; } }} break; case 696: -#line 971 "char_ref.rl" {te = p+1;{ output->first = 0x22c0; {p++; goto _out; } }} break; case 697: -#line 972 "char_ref.rl" {te = p+1;{ output->first = 0x290d; {p++; goto _out; } }} break; case 698: -#line 973 "char_ref.rl" {te = p+1;{ output->first = 0x29eb; {p++; goto _out; } }} break; case 699: -#line 974 "char_ref.rl" {te = p+1;{ output->first = 0x25aa; {p++; goto _out; } }} break; case 700: -#line 975 "char_ref.rl" {te = p+1;{ output->first = 0x25b4; {p++; goto _out; } }} break; case 701: -#line 976 "char_ref.rl" {te = p+1;{ output->first = 0x25be; {p++; goto _out; } }} 
break; case 702: -#line 977 "char_ref.rl" {te = p+1;{ output->first = 0x25c2; {p++; goto _out; } }} break; case 703: -#line 978 "char_ref.rl" {te = p+1;{ output->first = 0x25b8; {p++; goto _out; } }} break; case 704: -#line 979 "char_ref.rl" {te = p+1;{ output->first = 0x2423; {p++; goto _out; } }} break; case 705: -#line 980 "char_ref.rl" {te = p+1;{ output->first = 0x2592; {p++; goto _out; } }} break; case 706: -#line 981 "char_ref.rl" {te = p+1;{ output->first = 0x2591; {p++; goto _out; } }} break; case 707: -#line 982 "char_ref.rl" {te = p+1;{ output->first = 0x2593; {p++; goto _out; } }} break; case 708: -#line 983 "char_ref.rl" {te = p+1;{ output->first = 0x2588; {p++; goto _out; } }} break; case 709: -#line 984 "char_ref.rl" {te = p+1;{ output->first = 0x3d; output->second = 0x20e5; {p++; goto _out; } }} break; case 710: -#line 985 "char_ref.rl" {te = p+1;{ output->first = 0x2261; output->second = 0x20e5; {p++; goto _out; } }} break; case 711: -#line 986 "char_ref.rl" {te = p+1;{ output->first = 0x2310; {p++; goto _out; } }} break; case 712: -#line 987 "char_ref.rl" {te = p+1;{ output->first = 0x0001d553; {p++; goto _out; } }} break; case 713: -#line 988 "char_ref.rl" {te = p+1;{ output->first = 0x22a5; {p++; goto _out; } }} break; case 714: -#line 989 "char_ref.rl" {te = p+1;{ output->first = 0x22a5; {p++; goto _out; } }} break; case 715: -#line 990 "char_ref.rl" {te = p+1;{ output->first = 0x22c8; {p++; goto _out; } }} break; case 716: -#line 991 "char_ref.rl" {te = p+1;{ output->first = 0x2557; {p++; goto _out; } }} break; case 717: -#line 992 "char_ref.rl" {te = p+1;{ output->first = 0x2554; {p++; goto _out; } }} break; case 718: -#line 993 "char_ref.rl" {te = p+1;{ output->first = 0x2556; {p++; goto _out; } }} break; case 719: -#line 994 "char_ref.rl" {te = p+1;{ output->first = 0x2553; {p++; goto _out; } }} break; case 720: -#line 995 "char_ref.rl" {te = p+1;{ output->first = 0x2550; {p++; goto _out; } }} break; case 721: -#line 996 "char_ref.rl" {te = 
p+1;{ output->first = 0x2566; {p++; goto _out; } }} break; case 722: -#line 997 "char_ref.rl" {te = p+1;{ output->first = 0x2569; {p++; goto _out; } }} break; case 723: -#line 998 "char_ref.rl" {te = p+1;{ output->first = 0x2564; {p++; goto _out; } }} break; case 724: -#line 999 "char_ref.rl" {te = p+1;{ output->first = 0x2567; {p++; goto _out; } }} break; case 725: -#line 1000 "char_ref.rl" {te = p+1;{ output->first = 0x255d; {p++; goto _out; } }} break; case 726: -#line 1001 "char_ref.rl" {te = p+1;{ output->first = 0x255a; {p++; goto _out; } }} break; case 727: -#line 1002 "char_ref.rl" {te = p+1;{ output->first = 0x255c; {p++; goto _out; } }} break; case 728: -#line 1003 "char_ref.rl" {te = p+1;{ output->first = 0x2559; {p++; goto _out; } }} break; case 729: -#line 1004 "char_ref.rl" {te = p+1;{ output->first = 0x2551; {p++; goto _out; } }} break; case 730: -#line 1005 "char_ref.rl" {te = p+1;{ output->first = 0x256c; {p++; goto _out; } }} break; case 731: -#line 1006 "char_ref.rl" {te = p+1;{ output->first = 0x2563; {p++; goto _out; } }} break; case 732: -#line 1007 "char_ref.rl" {te = p+1;{ output->first = 0x2560; {p++; goto _out; } }} break; case 733: -#line 1008 "char_ref.rl" {te = p+1;{ output->first = 0x256b; {p++; goto _out; } }} break; case 734: -#line 1009 "char_ref.rl" {te = p+1;{ output->first = 0x2562; {p++; goto _out; } }} break; case 735: -#line 1010 "char_ref.rl" {te = p+1;{ output->first = 0x255f; {p++; goto _out; } }} break; case 736: -#line 1011 "char_ref.rl" {te = p+1;{ output->first = 0x29c9; {p++; goto _out; } }} break; case 737: -#line 1012 "char_ref.rl" {te = p+1;{ output->first = 0x2555; {p++; goto _out; } }} break; case 738: -#line 1013 "char_ref.rl" {te = p+1;{ output->first = 0x2552; {p++; goto _out; } }} break; case 739: -#line 1014 "char_ref.rl" {te = p+1;{ output->first = 0x2510; {p++; goto _out; } }} break; case 740: -#line 1015 "char_ref.rl" {te = p+1;{ output->first = 0x250c; {p++; goto _out; } }} break; case 741: -#line 1016 
"char_ref.rl" {te = p+1;{ output->first = 0x2500; {p++; goto _out; } }} break; case 742: -#line 1017 "char_ref.rl" {te = p+1;{ output->first = 0x2565; {p++; goto _out; } }} break; case 743: -#line 1018 "char_ref.rl" {te = p+1;{ output->first = 0x2568; {p++; goto _out; } }} break; case 744: -#line 1019 "char_ref.rl" {te = p+1;{ output->first = 0x252c; {p++; goto _out; } }} break; case 745: -#line 1020 "char_ref.rl" {te = p+1;{ output->first = 0x2534; {p++; goto _out; } }} break; case 746: -#line 1021 "char_ref.rl" {te = p+1;{ output->first = 0x229f; {p++; goto _out; } }} break; case 747: -#line 1022 "char_ref.rl" {te = p+1;{ output->first = 0x229e; {p++; goto _out; } }} break; case 748: -#line 1023 "char_ref.rl" {te = p+1;{ output->first = 0x22a0; {p++; goto _out; } }} break; case 749: -#line 1024 "char_ref.rl" {te = p+1;{ output->first = 0x255b; {p++; goto _out; } }} break; case 750: -#line 1025 "char_ref.rl" {te = p+1;{ output->first = 0x2558; {p++; goto _out; } }} break; case 751: -#line 1026 "char_ref.rl" {te = p+1;{ output->first = 0x2518; {p++; goto _out; } }} break; case 752: -#line 1027 "char_ref.rl" {te = p+1;{ output->first = 0x2514; {p++; goto _out; } }} break; case 753: -#line 1028 "char_ref.rl" {te = p+1;{ output->first = 0x2502; {p++; goto _out; } }} break; case 754: -#line 1029 "char_ref.rl" {te = p+1;{ output->first = 0x256a; {p++; goto _out; } }} break; case 755: -#line 1030 "char_ref.rl" {te = p+1;{ output->first = 0x2561; {p++; goto _out; } }} break; case 756: -#line 1031 "char_ref.rl" {te = p+1;{ output->first = 0x255e; {p++; goto _out; } }} break; case 757: -#line 1032 "char_ref.rl" {te = p+1;{ output->first = 0x253c; {p++; goto _out; } }} break; case 758: -#line 1033 "char_ref.rl" {te = p+1;{ output->first = 0x2524; {p++; goto _out; } }} break; case 759: -#line 1034 "char_ref.rl" {te = p+1;{ output->first = 0x251c; {p++; goto _out; } }} break; case 760: -#line 1035 "char_ref.rl" {te = p+1;{ output->first = 0x2035; {p++; goto _out; } }} break; 
case 761: -#line 1036 "char_ref.rl" {te = p+1;{ output->first = 0x02d8; {p++; goto _out; } }} break; case 762: -#line 1037 "char_ref.rl" {te = p+1;{ output->first = 0xa6; {p++; goto _out; } }} break; case 763: -#line 1039 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b7; {p++; goto _out; } }} break; case 764: -#line 1040 "char_ref.rl" {te = p+1;{ output->first = 0x204f; {p++; goto _out; } }} break; case 765: -#line 1041 "char_ref.rl" {te = p+1;{ output->first = 0x223d; {p++; goto _out; } }} break; case 766: -#line 1042 "char_ref.rl" {te = p+1;{ output->first = 0x22cd; {p++; goto _out; } }} break; case 767: -#line 1043 "char_ref.rl" {te = p+1;{ output->first = 0x5c; {p++; goto _out; } }} break; case 768: -#line 1044 "char_ref.rl" {te = p+1;{ output->first = 0x29c5; {p++; goto _out; } }} break; case 769: -#line 1045 "char_ref.rl" {te = p+1;{ output->first = 0x27c8; {p++; goto _out; } }} break; case 770: -#line 1046 "char_ref.rl" {te = p+1;{ output->first = 0x2022; {p++; goto _out; } }} break; case 771: -#line 1047 "char_ref.rl" {te = p+1;{ output->first = 0x2022; {p++; goto _out; } }} break; case 772: -#line 1048 "char_ref.rl" {te = p+1;{ output->first = 0x224e; {p++; goto _out; } }} break; case 773: -#line 1049 "char_ref.rl" {te = p+1;{ output->first = 0x2aae; {p++; goto _out; } }} break; case 774: -#line 1050 "char_ref.rl" {te = p+1;{ output->first = 0x224f; {p++; goto _out; } }} break; case 775: -#line 1051 "char_ref.rl" {te = p+1;{ output->first = 0x224f; {p++; goto _out; } }} break; case 776: -#line 1052 "char_ref.rl" {te = p+1;{ output->first = 0x0107; {p++; goto _out; } }} break; case 777: -#line 1053 "char_ref.rl" {te = p+1;{ output->first = 0x2229; {p++; goto _out; } }} break; case 778: -#line 1054 "char_ref.rl" {te = p+1;{ output->first = 0x2a44; {p++; goto _out; } }} break; case 779: -#line 1055 "char_ref.rl" {te = p+1;{ output->first = 0x2a49; {p++; goto _out; } }} break; case 780: -#line 1056 "char_ref.rl" {te = p+1;{ output->first = 0x2a4b; {p++; 
goto _out; } }} break; case 781: -#line 1057 "char_ref.rl" {te = p+1;{ output->first = 0x2a47; {p++; goto _out; } }} break; case 782: -#line 1058 "char_ref.rl" {te = p+1;{ output->first = 0x2a40; {p++; goto _out; } }} break; case 783: -#line 1059 "char_ref.rl" {te = p+1;{ output->first = 0x2229; output->second = 0xfe00; {p++; goto _out; } }} break; case 784: -#line 1060 "char_ref.rl" {te = p+1;{ output->first = 0x2041; {p++; goto _out; } }} break; case 785: -#line 1061 "char_ref.rl" {te = p+1;{ output->first = 0x02c7; {p++; goto _out; } }} break; case 786: -#line 1062 "char_ref.rl" {te = p+1;{ output->first = 0x2a4d; {p++; goto _out; } }} break; case 787: -#line 1063 "char_ref.rl" {te = p+1;{ output->first = 0x010d; {p++; goto _out; } }} break; case 788: -#line 1064 "char_ref.rl" {te = p+1;{ output->first = 0xe7; {p++; goto _out; } }} break; case 789: -#line 1066 "char_ref.rl" {te = p+1;{ output->first = 0x0109; {p++; goto _out; } }} break; case 790: -#line 1067 "char_ref.rl" {te = p+1;{ output->first = 0x2a4c; {p++; goto _out; } }} break; case 791: -#line 1068 "char_ref.rl" {te = p+1;{ output->first = 0x2a50; {p++; goto _out; } }} break; case 792: -#line 1069 "char_ref.rl" {te = p+1;{ output->first = 0x010b; {p++; goto _out; } }} break; case 793: -#line 1070 "char_ref.rl" {te = p+1;{ output->first = 0xb8; {p++; goto _out; } }} break; case 794: -#line 1072 "char_ref.rl" {te = p+1;{ output->first = 0x29b2; {p++; goto _out; } }} break; case 795: -#line 1073 "char_ref.rl" {te = p+1;{ output->first = 0xa2; {p++; goto _out; } }} break; case 796: -#line 1075 "char_ref.rl" {te = p+1;{ output->first = 0xb7; {p++; goto _out; } }} break; case 797: -#line 1076 "char_ref.rl" {te = p+1;{ output->first = 0x0001d520; {p++; goto _out; } }} break; case 798: -#line 1077 "char_ref.rl" {te = p+1;{ output->first = 0x0447; {p++; goto _out; } }} break; case 799: -#line 1078 "char_ref.rl" {te = p+1;{ output->first = 0x2713; {p++; goto _out; } }} break; case 800: -#line 1079 "char_ref.rl" 
{te = p+1;{ output->first = 0x2713; {p++; goto _out; } }} break; case 801: -#line 1080 "char_ref.rl" {te = p+1;{ output->first = 0x03c7; {p++; goto _out; } }} break; case 802: -#line 1081 "char_ref.rl" {te = p+1;{ output->first = 0x25cb; {p++; goto _out; } }} break; case 803: -#line 1082 "char_ref.rl" {te = p+1;{ output->first = 0x29c3; {p++; goto _out; } }} break; case 804: -#line 1083 "char_ref.rl" {te = p+1;{ output->first = 0x02c6; {p++; goto _out; } }} break; case 805: -#line 1084 "char_ref.rl" {te = p+1;{ output->first = 0x2257; {p++; goto _out; } }} break; case 806: -#line 1085 "char_ref.rl" {te = p+1;{ output->first = 0x21ba; {p++; goto _out; } }} break; case 807: -#line 1086 "char_ref.rl" {te = p+1;{ output->first = 0x21bb; {p++; goto _out; } }} break; case 808: -#line 1087 "char_ref.rl" {te = p+1;{ output->first = 0xae; {p++; goto _out; } }} break; case 809: -#line 1088 "char_ref.rl" {te = p+1;{ output->first = 0x24c8; {p++; goto _out; } }} break; case 810: -#line 1089 "char_ref.rl" {te = p+1;{ output->first = 0x229b; {p++; goto _out; } }} break; case 811: -#line 1090 "char_ref.rl" {te = p+1;{ output->first = 0x229a; {p++; goto _out; } }} break; case 812: -#line 1091 "char_ref.rl" {te = p+1;{ output->first = 0x229d; {p++; goto _out; } }} break; case 813: -#line 1092 "char_ref.rl" {te = p+1;{ output->first = 0x2257; {p++; goto _out; } }} break; case 814: -#line 1093 "char_ref.rl" {te = p+1;{ output->first = 0x2a10; {p++; goto _out; } }} break; case 815: -#line 1094 "char_ref.rl" {te = p+1;{ output->first = 0x2aef; {p++; goto _out; } }} break; case 816: -#line 1095 "char_ref.rl" {te = p+1;{ output->first = 0x29c2; {p++; goto _out; } }} break; case 817: -#line 1096 "char_ref.rl" {te = p+1;{ output->first = 0x2663; {p++; goto _out; } }} break; case 818: -#line 1097 "char_ref.rl" {te = p+1;{ output->first = 0x2663; {p++; goto _out; } }} break; case 819: -#line 1098 "char_ref.rl" {te = p+1;{ output->first = 0x3a; {p++; goto _out; } }} break; case 820: -#line 
1099 "char_ref.rl" {te = p+1;{ output->first = 0x2254; {p++; goto _out; } }} break; case 821: -#line 1100 "char_ref.rl" {te = p+1;{ output->first = 0x2254; {p++; goto _out; } }} break; case 822: -#line 1101 "char_ref.rl" {te = p+1;{ output->first = 0x2c; {p++; goto _out; } }} break; case 823: -#line 1102 "char_ref.rl" {te = p+1;{ output->first = 0x40; {p++; goto _out; } }} break; case 824: -#line 1103 "char_ref.rl" {te = p+1;{ output->first = 0x2201; {p++; goto _out; } }} break; case 825: -#line 1104 "char_ref.rl" {te = p+1;{ output->first = 0x2218; {p++; goto _out; } }} break; case 826: -#line 1105 "char_ref.rl" {te = p+1;{ output->first = 0x2201; {p++; goto _out; } }} break; case 827: -#line 1106 "char_ref.rl" {te = p+1;{ output->first = 0x2102; {p++; goto _out; } }} break; case 828: -#line 1107 "char_ref.rl" {te = p+1;{ output->first = 0x2245; {p++; goto _out; } }} break; case 829: -#line 1108 "char_ref.rl" {te = p+1;{ output->first = 0x2a6d; {p++; goto _out; } }} break; case 830: -#line 1109 "char_ref.rl" {te = p+1;{ output->first = 0x222e; {p++; goto _out; } }} break; case 831: -#line 1110 "char_ref.rl" {te = p+1;{ output->first = 0x0001d554; {p++; goto _out; } }} break; case 832: -#line 1111 "char_ref.rl" {te = p+1;{ output->first = 0x2210; {p++; goto _out; } }} break; case 833: -#line 1112 "char_ref.rl" {te = p+1;{ output->first = 0xa9; {p++; goto _out; } }} break; case 834: -#line 1114 "char_ref.rl" {te = p+1;{ output->first = 0x2117; {p++; goto _out; } }} break; case 835: -#line 1115 "char_ref.rl" {te = p+1;{ output->first = 0x21b5; {p++; goto _out; } }} break; case 836: -#line 1116 "char_ref.rl" {te = p+1;{ output->first = 0x2717; {p++; goto _out; } }} break; case 837: -#line 1117 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b8; {p++; goto _out; } }} break; case 838: -#line 1118 "char_ref.rl" {te = p+1;{ output->first = 0x2acf; {p++; goto _out; } }} break; case 839: -#line 1119 "char_ref.rl" {te = p+1;{ output->first = 0x2ad1; {p++; goto _out; } }} 
break; case 840: -#line 1120 "char_ref.rl" {te = p+1;{ output->first = 0x2ad0; {p++; goto _out; } }} break; case 841: -#line 1121 "char_ref.rl" {te = p+1;{ output->first = 0x2ad2; {p++; goto _out; } }} break; case 842: -#line 1122 "char_ref.rl" {te = p+1;{ output->first = 0x22ef; {p++; goto _out; } }} break; case 843: -#line 1123 "char_ref.rl" {te = p+1;{ output->first = 0x2938; {p++; goto _out; } }} break; case 844: -#line 1124 "char_ref.rl" {te = p+1;{ output->first = 0x2935; {p++; goto _out; } }} break; case 845: -#line 1125 "char_ref.rl" {te = p+1;{ output->first = 0x22de; {p++; goto _out; } }} break; case 846: -#line 1126 "char_ref.rl" {te = p+1;{ output->first = 0x22df; {p++; goto _out; } }} break; case 847: -#line 1127 "char_ref.rl" {te = p+1;{ output->first = 0x21b6; {p++; goto _out; } }} break; case 848: -#line 1128 "char_ref.rl" {te = p+1;{ output->first = 0x293d; {p++; goto _out; } }} break; case 849: -#line 1129 "char_ref.rl" {te = p+1;{ output->first = 0x222a; {p++; goto _out; } }} break; case 850: -#line 1130 "char_ref.rl" {te = p+1;{ output->first = 0x2a48; {p++; goto _out; } }} break; case 851: -#line 1131 "char_ref.rl" {te = p+1;{ output->first = 0x2a46; {p++; goto _out; } }} break; case 852: -#line 1132 "char_ref.rl" {te = p+1;{ output->first = 0x2a4a; {p++; goto _out; } }} break; case 853: -#line 1133 "char_ref.rl" {te = p+1;{ output->first = 0x228d; {p++; goto _out; } }} break; case 854: -#line 1134 "char_ref.rl" {te = p+1;{ output->first = 0x2a45; {p++; goto _out; } }} break; case 855: -#line 1135 "char_ref.rl" {te = p+1;{ output->first = 0x222a; output->second = 0xfe00; {p++; goto _out; } }} break; case 856: -#line 1136 "char_ref.rl" {te = p+1;{ output->first = 0x21b7; {p++; goto _out; } }} break; case 857: -#line 1137 "char_ref.rl" {te = p+1;{ output->first = 0x293c; {p++; goto _out; } }} break; case 858: -#line 1138 "char_ref.rl" {te = p+1;{ output->first = 0x22de; {p++; goto _out; } }} break; case 859: -#line 1139 "char_ref.rl" {te = p+1;{ 
output->first = 0x22df; {p++; goto _out; } }} break; case 860: -#line 1140 "char_ref.rl" {te = p+1;{ output->first = 0x22ce; {p++; goto _out; } }} break; case 861: -#line 1141 "char_ref.rl" {te = p+1;{ output->first = 0x22cf; {p++; goto _out; } }} break; case 862: -#line 1142 "char_ref.rl" {te = p+1;{ output->first = 0xa4; {p++; goto _out; } }} break; case 863: -#line 1144 "char_ref.rl" {te = p+1;{ output->first = 0x21b6; {p++; goto _out; } }} break; case 864: -#line 1145 "char_ref.rl" {te = p+1;{ output->first = 0x21b7; {p++; goto _out; } }} break; case 865: -#line 1146 "char_ref.rl" {te = p+1;{ output->first = 0x22ce; {p++; goto _out; } }} break; case 866: -#line 1147 "char_ref.rl" {te = p+1;{ output->first = 0x22cf; {p++; goto _out; } }} break; case 867: -#line 1148 "char_ref.rl" {te = p+1;{ output->first = 0x2232; {p++; goto _out; } }} break; case 868: -#line 1149 "char_ref.rl" {te = p+1;{ output->first = 0x2231; {p++; goto _out; } }} break; case 869: -#line 1150 "char_ref.rl" {te = p+1;{ output->first = 0x232d; {p++; goto _out; } }} break; case 870: -#line 1151 "char_ref.rl" {te = p+1;{ output->first = 0x21d3; {p++; goto _out; } }} break; case 871: -#line 1152 "char_ref.rl" {te = p+1;{ output->first = 0x2965; {p++; goto _out; } }} break; case 872: -#line 1153 "char_ref.rl" {te = p+1;{ output->first = 0x2020; {p++; goto _out; } }} break; case 873: -#line 1154 "char_ref.rl" {te = p+1;{ output->first = 0x2138; {p++; goto _out; } }} break; case 874: -#line 1155 "char_ref.rl" {te = p+1;{ output->first = 0x2193; {p++; goto _out; } }} break; case 875: -#line 1156 "char_ref.rl" {te = p+1;{ output->first = 0x2010; {p++; goto _out; } }} break; case 876: -#line 1157 "char_ref.rl" {te = p+1;{ output->first = 0x22a3; {p++; goto _out; } }} break; case 877: -#line 1158 "char_ref.rl" {te = p+1;{ output->first = 0x290f; {p++; goto _out; } }} break; case 878: -#line 1159 "char_ref.rl" {te = p+1;{ output->first = 0x02dd; {p++; goto _out; } }} break; case 879: -#line 1160 
"char_ref.rl" {te = p+1;{ output->first = 0x010f; {p++; goto _out; } }} break; case 880: -#line 1161 "char_ref.rl" {te = p+1;{ output->first = 0x0434; {p++; goto _out; } }} break; case 881: -#line 1162 "char_ref.rl" {te = p+1;{ output->first = 0x2146; {p++; goto _out; } }} break; case 882: -#line 1163 "char_ref.rl" {te = p+1;{ output->first = 0x2021; {p++; goto _out; } }} break; case 883: -#line 1164 "char_ref.rl" {te = p+1;{ output->first = 0x21ca; {p++; goto _out; } }} break; case 884: -#line 1165 "char_ref.rl" {te = p+1;{ output->first = 0x2a77; {p++; goto _out; } }} break; case 885: -#line 1166 "char_ref.rl" {te = p+1;{ output->first = 0xb0; {p++; goto _out; } }} break; case 886: -#line 1168 "char_ref.rl" {te = p+1;{ output->first = 0x03b4; {p++; goto _out; } }} break; case 887: -#line 1169 "char_ref.rl" {te = p+1;{ output->first = 0x29b1; {p++; goto _out; } }} break; case 888: -#line 1170 "char_ref.rl" {te = p+1;{ output->first = 0x297f; {p++; goto _out; } }} break; case 889: -#line 1171 "char_ref.rl" {te = p+1;{ output->first = 0x0001d521; {p++; goto _out; } }} break; case 890: -#line 1172 "char_ref.rl" {te = p+1;{ output->first = 0x21c3; {p++; goto _out; } }} break; case 891: -#line 1173 "char_ref.rl" {te = p+1;{ output->first = 0x21c2; {p++; goto _out; } }} break; case 892: -#line 1174 "char_ref.rl" {te = p+1;{ output->first = 0x22c4; {p++; goto _out; } }} break; case 893: -#line 1175 "char_ref.rl" {te = p+1;{ output->first = 0x22c4; {p++; goto _out; } }} break; case 894: -#line 1176 "char_ref.rl" {te = p+1;{ output->first = 0x2666; {p++; goto _out; } }} break; case 895: -#line 1177 "char_ref.rl" {te = p+1;{ output->first = 0x2666; {p++; goto _out; } }} break; case 896: -#line 1178 "char_ref.rl" {te = p+1;{ output->first = 0xa8; {p++; goto _out; } }} break; case 897: -#line 1179 "char_ref.rl" {te = p+1;{ output->first = 0x03dd; {p++; goto _out; } }} break; case 898: -#line 1180 "char_ref.rl" {te = p+1;{ output->first = 0x22f2; {p++; goto _out; } }} break; 
case 899: -#line 1181 "char_ref.rl" {te = p+1;{ output->first = 0xf7; {p++; goto _out; } }} break; case 900: -#line 1182 "char_ref.rl" {te = p+1;{ output->first = 0xf7; {p++; goto _out; } }} break; case 901: -#line 1184 "char_ref.rl" {te = p+1;{ output->first = 0x22c7; {p++; goto _out; } }} break; case 902: -#line 1185 "char_ref.rl" {te = p+1;{ output->first = 0x22c7; {p++; goto _out; } }} break; case 903: -#line 1186 "char_ref.rl" {te = p+1;{ output->first = 0x0452; {p++; goto _out; } }} break; case 904: -#line 1187 "char_ref.rl" {te = p+1;{ output->first = 0x231e; {p++; goto _out; } }} break; case 905: -#line 1188 "char_ref.rl" {te = p+1;{ output->first = 0x230d; {p++; goto _out; } }} break; case 906: -#line 1189 "char_ref.rl" {te = p+1;{ output->first = 0x24; {p++; goto _out; } }} break; case 907: -#line 1190 "char_ref.rl" {te = p+1;{ output->first = 0x0001d555; {p++; goto _out; } }} break; case 908: -#line 1191 "char_ref.rl" {te = p+1;{ output->first = 0x02d9; {p++; goto _out; } }} break; case 909: -#line 1192 "char_ref.rl" {te = p+1;{ output->first = 0x2250; {p++; goto _out; } }} break; case 910: -#line 1193 "char_ref.rl" {te = p+1;{ output->first = 0x2251; {p++; goto _out; } }} break; case 911: -#line 1194 "char_ref.rl" {te = p+1;{ output->first = 0x2238; {p++; goto _out; } }} break; case 912: -#line 1195 "char_ref.rl" {te = p+1;{ output->first = 0x2214; {p++; goto _out; } }} break; case 913: -#line 1196 "char_ref.rl" {te = p+1;{ output->first = 0x22a1; {p++; goto _out; } }} break; case 914: -#line 1197 "char_ref.rl" {te = p+1;{ output->first = 0x2306; {p++; goto _out; } }} break; case 915: -#line 1198 "char_ref.rl" {te = p+1;{ output->first = 0x2193; {p++; goto _out; } }} break; case 916: -#line 1199 "char_ref.rl" {te = p+1;{ output->first = 0x21ca; {p++; goto _out; } }} break; case 917: -#line 1200 "char_ref.rl" {te = p+1;{ output->first = 0x21c3; {p++; goto _out; } }} break; case 918: -#line 1201 "char_ref.rl" {te = p+1;{ output->first = 0x21c2; {p++; goto 
_out; } }} break; case 919: -#line 1202 "char_ref.rl" {te = p+1;{ output->first = 0x2910; {p++; goto _out; } }} break; case 920: -#line 1203 "char_ref.rl" {te = p+1;{ output->first = 0x231f; {p++; goto _out; } }} break; case 921: -#line 1204 "char_ref.rl" {te = p+1;{ output->first = 0x230c; {p++; goto _out; } }} break; case 922: -#line 1205 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4b9; {p++; goto _out; } }} break; case 923: -#line 1206 "char_ref.rl" {te = p+1;{ output->first = 0x0455; {p++; goto _out; } }} break; case 924: -#line 1207 "char_ref.rl" {te = p+1;{ output->first = 0x29f6; {p++; goto _out; } }} break; case 925: -#line 1208 "char_ref.rl" {te = p+1;{ output->first = 0x0111; {p++; goto _out; } }} break; case 926: -#line 1209 "char_ref.rl" {te = p+1;{ output->first = 0x22f1; {p++; goto _out; } }} break; case 927: -#line 1210 "char_ref.rl" {te = p+1;{ output->first = 0x25bf; {p++; goto _out; } }} break; case 928: -#line 1211 "char_ref.rl" {te = p+1;{ output->first = 0x25be; {p++; goto _out; } }} break; case 929: -#line 1212 "char_ref.rl" {te = p+1;{ output->first = 0x21f5; {p++; goto _out; } }} break; case 930: -#line 1213 "char_ref.rl" {te = p+1;{ output->first = 0x296f; {p++; goto _out; } }} break; case 931: -#line 1214 "char_ref.rl" {te = p+1;{ output->first = 0x29a6; {p++; goto _out; } }} break; case 932: -#line 1215 "char_ref.rl" {te = p+1;{ output->first = 0x045f; {p++; goto _out; } }} break; case 933: -#line 1216 "char_ref.rl" {te = p+1;{ output->first = 0x27ff; {p++; goto _out; } }} break; case 934: -#line 1217 "char_ref.rl" {te = p+1;{ output->first = 0x2a77; {p++; goto _out; } }} break; case 935: -#line 1218 "char_ref.rl" {te = p+1;{ output->first = 0x2251; {p++; goto _out; } }} break; case 936: -#line 1219 "char_ref.rl" {te = p+1;{ output->first = 0xe9; {p++; goto _out; } }} break; case 937: -#line 1221 "char_ref.rl" {te = p+1;{ output->first = 0x2a6e; {p++; goto _out; } }} break; case 938: -#line 1222 "char_ref.rl" {te = p+1;{ 
output->first = 0x011b; {p++; goto _out; } }} break; case 939: -#line 1223 "char_ref.rl" {te = p+1;{ output->first = 0x2256; {p++; goto _out; } }} break; case 940: -#line 1224 "char_ref.rl" {te = p+1;{ output->first = 0xea; {p++; goto _out; } }} break; case 941: -#line 1226 "char_ref.rl" {te = p+1;{ output->first = 0x2255; {p++; goto _out; } }} break; case 942: -#line 1227 "char_ref.rl" {te = p+1;{ output->first = 0x044d; {p++; goto _out; } }} break; case 943: -#line 1228 "char_ref.rl" {te = p+1;{ output->first = 0x0117; {p++; goto _out; } }} break; case 944: -#line 1229 "char_ref.rl" {te = p+1;{ output->first = 0x2147; {p++; goto _out; } }} break; case 945: -#line 1230 "char_ref.rl" {te = p+1;{ output->first = 0x2252; {p++; goto _out; } }} break; case 946: -#line 1231 "char_ref.rl" {te = p+1;{ output->first = 0x0001d522; {p++; goto _out; } }} break; case 947: -#line 1232 "char_ref.rl" {te = p+1;{ output->first = 0x2a9a; {p++; goto _out; } }} break; case 948: -#line 1233 "char_ref.rl" {te = p+1;{ output->first = 0xe8; {p++; goto _out; } }} break; case 949: -#line 1235 "char_ref.rl" {te = p+1;{ output->first = 0x2a96; {p++; goto _out; } }} break; case 950: -#line 1236 "char_ref.rl" {te = p+1;{ output->first = 0x2a98; {p++; goto _out; } }} break; case 951: -#line 1237 "char_ref.rl" {te = p+1;{ output->first = 0x2a99; {p++; goto _out; } }} break; case 952: -#line 1238 "char_ref.rl" {te = p+1;{ output->first = 0x23e7; {p++; goto _out; } }} break; case 953: -#line 1239 "char_ref.rl" {te = p+1;{ output->first = 0x2113; {p++; goto _out; } }} break; case 954: -#line 1240 "char_ref.rl" {te = p+1;{ output->first = 0x2a95; {p++; goto _out; } }} break; case 955: -#line 1241 "char_ref.rl" {te = p+1;{ output->first = 0x2a97; {p++; goto _out; } }} break; case 956: -#line 1242 "char_ref.rl" {te = p+1;{ output->first = 0x0113; {p++; goto _out; } }} break; case 957: -#line 1243 "char_ref.rl" {te = p+1;{ output->first = 0x2205; {p++; goto _out; } }} break; case 958: -#line 1244 
"char_ref.rl" {te = p+1;{ output->first = 0x2205; {p++; goto _out; } }} break; case 959: -#line 1245 "char_ref.rl" {te = p+1;{ output->first = 0x2205; {p++; goto _out; } }} break; case 960: -#line 1246 "char_ref.rl" {te = p+1;{ output->first = 0x2004; {p++; goto _out; } }} break; case 961: -#line 1247 "char_ref.rl" {te = p+1;{ output->first = 0x2005; {p++; goto _out; } }} break; case 962: -#line 1248 "char_ref.rl" {te = p+1;{ output->first = 0x2003; {p++; goto _out; } }} break; case 963: -#line 1249 "char_ref.rl" {te = p+1;{ output->first = 0x014b; {p++; goto _out; } }} break; case 964: -#line 1250 "char_ref.rl" {te = p+1;{ output->first = 0x2002; {p++; goto _out; } }} break; case 965: -#line 1251 "char_ref.rl" {te = p+1;{ output->first = 0x0119; {p++; goto _out; } }} break; case 966: -#line 1252 "char_ref.rl" {te = p+1;{ output->first = 0x0001d556; {p++; goto _out; } }} break; case 967: -#line 1253 "char_ref.rl" {te = p+1;{ output->first = 0x22d5; {p++; goto _out; } }} break; case 968: -#line 1254 "char_ref.rl" {te = p+1;{ output->first = 0x29e3; {p++; goto _out; } }} break; case 969: -#line 1255 "char_ref.rl" {te = p+1;{ output->first = 0x2a71; {p++; goto _out; } }} break; case 970: -#line 1256 "char_ref.rl" {te = p+1;{ output->first = 0x03b5; {p++; goto _out; } }} break; case 971: -#line 1257 "char_ref.rl" {te = p+1;{ output->first = 0x03b5; {p++; goto _out; } }} break; case 972: -#line 1258 "char_ref.rl" {te = p+1;{ output->first = 0x03f5; {p++; goto _out; } }} break; case 973: -#line 1259 "char_ref.rl" {te = p+1;{ output->first = 0x2256; {p++; goto _out; } }} break; case 974: -#line 1260 "char_ref.rl" {te = p+1;{ output->first = 0x2255; {p++; goto _out; } }} break; case 975: -#line 1261 "char_ref.rl" {te = p+1;{ output->first = 0x2242; {p++; goto _out; } }} break; case 976: -#line 1262 "char_ref.rl" {te = p+1;{ output->first = 0x2a96; {p++; goto _out; } }} break; case 977: -#line 1263 "char_ref.rl" {te = p+1;{ output->first = 0x2a95; {p++; goto _out; } }} 
break; case 978: -#line 1264 "char_ref.rl" {te = p+1;{ output->first = 0x3d; {p++; goto _out; } }} break; case 979: -#line 1265 "char_ref.rl" {te = p+1;{ output->first = 0x225f; {p++; goto _out; } }} break; case 980: -#line 1266 "char_ref.rl" {te = p+1;{ output->first = 0x2261; {p++; goto _out; } }} break; case 981: -#line 1267 "char_ref.rl" {te = p+1;{ output->first = 0x2a78; {p++; goto _out; } }} break; case 982: -#line 1268 "char_ref.rl" {te = p+1;{ output->first = 0x29e5; {p++; goto _out; } }} break; case 983: -#line 1269 "char_ref.rl" {te = p+1;{ output->first = 0x2253; {p++; goto _out; } }} break; case 984: -#line 1270 "char_ref.rl" {te = p+1;{ output->first = 0x2971; {p++; goto _out; } }} break; case 985: -#line 1271 "char_ref.rl" {te = p+1;{ output->first = 0x212f; {p++; goto _out; } }} break; case 986: -#line 1272 "char_ref.rl" {te = p+1;{ output->first = 0x2250; {p++; goto _out; } }} break; case 987: -#line 1273 "char_ref.rl" {te = p+1;{ output->first = 0x2242; {p++; goto _out; } }} break; case 988: -#line 1274 "char_ref.rl" {te = p+1;{ output->first = 0x03b7; {p++; goto _out; } }} break; case 989: -#line 1275 "char_ref.rl" {te = p+1;{ output->first = 0xf0; {p++; goto _out; } }} break; case 990: -#line 1277 "char_ref.rl" {te = p+1;{ output->first = 0xeb; {p++; goto _out; } }} break; case 991: -#line 1279 "char_ref.rl" {te = p+1;{ output->first = 0x20ac; {p++; goto _out; } }} break; case 992: -#line 1280 "char_ref.rl" {te = p+1;{ output->first = 0x21; {p++; goto _out; } }} break; case 993: -#line 1281 "char_ref.rl" {te = p+1;{ output->first = 0x2203; {p++; goto _out; } }} break; case 994: -#line 1282 "char_ref.rl" {te = p+1;{ output->first = 0x2130; {p++; goto _out; } }} break; case 995: -#line 1283 "char_ref.rl" {te = p+1;{ output->first = 0x2147; {p++; goto _out; } }} break; case 996: -#line 1284 "char_ref.rl" {te = p+1;{ output->first = 0x2252; {p++; goto _out; } }} break; case 997: -#line 1285 "char_ref.rl" {te = p+1;{ output->first = 0x0444; {p++; 
goto _out; } }} break; case 998: -#line 1286 "char_ref.rl" {te = p+1;{ output->first = 0x2640; {p++; goto _out; } }} break; case 999: -#line 1287 "char_ref.rl" {te = p+1;{ output->first = 0xfb03; {p++; goto _out; } }} break; case 1000: -#line 1288 "char_ref.rl" {te = p+1;{ output->first = 0xfb00; {p++; goto _out; } }} break; case 1001: -#line 1289 "char_ref.rl" {te = p+1;{ output->first = 0xfb04; {p++; goto _out; } }} break; case 1002: -#line 1290 "char_ref.rl" {te = p+1;{ output->first = 0x0001d523; {p++; goto _out; } }} break; case 1003: -#line 1291 "char_ref.rl" {te = p+1;{ output->first = 0xfb01; {p++; goto _out; } }} break; case 1004: -#line 1292 "char_ref.rl" {te = p+1;{ output->first = 0x66; output->second = 0x6a; {p++; goto _out; } }} break; case 1005: -#line 1293 "char_ref.rl" {te = p+1;{ output->first = 0x266d; {p++; goto _out; } }} break; case 1006: -#line 1294 "char_ref.rl" {te = p+1;{ output->first = 0xfb02; {p++; goto _out; } }} break; case 1007: -#line 1295 "char_ref.rl" {te = p+1;{ output->first = 0x25b1; {p++; goto _out; } }} break; case 1008: -#line 1296 "char_ref.rl" {te = p+1;{ output->first = 0x0192; {p++; goto _out; } }} break; case 1009: -#line 1297 "char_ref.rl" {te = p+1;{ output->first = 0x0001d557; {p++; goto _out; } }} break; case 1010: -#line 1298 "char_ref.rl" {te = p+1;{ output->first = 0x2200; {p++; goto _out; } }} break; case 1011: -#line 1299 "char_ref.rl" {te = p+1;{ output->first = 0x22d4; {p++; goto _out; } }} break; case 1012: -#line 1300 "char_ref.rl" {te = p+1;{ output->first = 0x2ad9; {p++; goto _out; } }} break; case 1013: -#line 1301 "char_ref.rl" {te = p+1;{ output->first = 0x2a0d; {p++; goto _out; } }} break; case 1014: -#line 1302 "char_ref.rl" {te = p+1;{ output->first = 0xbd; {p++; goto _out; } }} break; case 1015: -#line 1304 "char_ref.rl" {te = p+1;{ output->first = 0x2153; {p++; goto _out; } }} break; case 1016: -#line 1305 "char_ref.rl" {te = p+1;{ output->first = 0xbc; {p++; goto _out; } }} break; case 1017: 
-#line 1307 "char_ref.rl" {te = p+1;{ output->first = 0x2155; {p++; goto _out; } }} break; case 1018: -#line 1308 "char_ref.rl" {te = p+1;{ output->first = 0x2159; {p++; goto _out; } }} break; case 1019: -#line 1309 "char_ref.rl" {te = p+1;{ output->first = 0x215b; {p++; goto _out; } }} break; case 1020: -#line 1310 "char_ref.rl" {te = p+1;{ output->first = 0x2154; {p++; goto _out; } }} break; case 1021: -#line 1311 "char_ref.rl" {te = p+1;{ output->first = 0x2156; {p++; goto _out; } }} break; case 1022: -#line 1312 "char_ref.rl" {te = p+1;{ output->first = 0xbe; {p++; goto _out; } }} break; case 1023: -#line 1314 "char_ref.rl" {te = p+1;{ output->first = 0x2157; {p++; goto _out; } }} break; case 1024: -#line 1315 "char_ref.rl" {te = p+1;{ output->first = 0x215c; {p++; goto _out; } }} break; case 1025: -#line 1316 "char_ref.rl" {te = p+1;{ output->first = 0x2158; {p++; goto _out; } }} break; case 1026: -#line 1317 "char_ref.rl" {te = p+1;{ output->first = 0x215a; {p++; goto _out; } }} break; case 1027: -#line 1318 "char_ref.rl" {te = p+1;{ output->first = 0x215d; {p++; goto _out; } }} break; case 1028: -#line 1319 "char_ref.rl" {te = p+1;{ output->first = 0x215e; {p++; goto _out; } }} break; case 1029: -#line 1320 "char_ref.rl" {te = p+1;{ output->first = 0x2044; {p++; goto _out; } }} break; case 1030: -#line 1321 "char_ref.rl" {te = p+1;{ output->first = 0x2322; {p++; goto _out; } }} break; case 1031: -#line 1322 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4bb; {p++; goto _out; } }} break; case 1032: -#line 1323 "char_ref.rl" {te = p+1;{ output->first = 0x2267; {p++; goto _out; } }} break; case 1033: -#line 1324 "char_ref.rl" {te = p+1;{ output->first = 0x2a8c; {p++; goto _out; } }} break; case 1034: -#line 1325 "char_ref.rl" {te = p+1;{ output->first = 0x01f5; {p++; goto _out; } }} break; case 1035: -#line 1326 "char_ref.rl" {te = p+1;{ output->first = 0x03b3; {p++; goto _out; } }} break; case 1036: -#line 1327 "char_ref.rl" {te = p+1;{ output->first = 
0x03dd; {p++; goto _out; } }} break; case 1037: -#line 1328 "char_ref.rl" {te = p+1;{ output->first = 0x2a86; {p++; goto _out; } }} break; case 1038: -#line 1329 "char_ref.rl" {te = p+1;{ output->first = 0x011f; {p++; goto _out; } }} break; case 1039: -#line 1330 "char_ref.rl" {te = p+1;{ output->first = 0x011d; {p++; goto _out; } }} break; case 1040: -#line 1331 "char_ref.rl" {te = p+1;{ output->first = 0x0433; {p++; goto _out; } }} break; case 1041: -#line 1332 "char_ref.rl" {te = p+1;{ output->first = 0x0121; {p++; goto _out; } }} break; case 1042: -#line 1333 "char_ref.rl" {te = p+1;{ output->first = 0x2265; {p++; goto _out; } }} break; case 1043: -#line 1334 "char_ref.rl" {te = p+1;{ output->first = 0x22db; {p++; goto _out; } }} break; case 1044: -#line 1335 "char_ref.rl" {te = p+1;{ output->first = 0x2265; {p++; goto _out; } }} break; case 1045: -#line 1336 "char_ref.rl" {te = p+1;{ output->first = 0x2267; {p++; goto _out; } }} break; case 1046: -#line 1337 "char_ref.rl" {te = p+1;{ output->first = 0x2a7e; {p++; goto _out; } }} break; case 1047: -#line 1338 "char_ref.rl" {te = p+1;{ output->first = 0x2a7e; {p++; goto _out; } }} break; case 1048: -#line 1339 "char_ref.rl" {te = p+1;{ output->first = 0x2aa9; {p++; goto _out; } }} break; case 1049: -#line 1340 "char_ref.rl" {te = p+1;{ output->first = 0x2a80; {p++; goto _out; } }} break; case 1050: -#line 1341 "char_ref.rl" {te = p+1;{ output->first = 0x2a82; {p++; goto _out; } }} break; case 1051: -#line 1342 "char_ref.rl" {te = p+1;{ output->first = 0x2a84; {p++; goto _out; } }} break; case 1052: -#line 1343 "char_ref.rl" {te = p+1;{ output->first = 0x22db; output->second = 0xfe00; {p++; goto _out; } }} break; case 1053: -#line 1344 "char_ref.rl" {te = p+1;{ output->first = 0x2a94; {p++; goto _out; } }} break; case 1054: -#line 1345 "char_ref.rl" {te = p+1;{ output->first = 0x0001d524; {p++; goto _out; } }} break; case 1055: -#line 1346 "char_ref.rl" {te = p+1;{ output->first = 0x226b; {p++; goto _out; } }} 
break; case 1056: -#line 1347 "char_ref.rl" {te = p+1;{ output->first = 0x22d9; {p++; goto _out; } }} break; case 1057: -#line 1348 "char_ref.rl" {te = p+1;{ output->first = 0x2137; {p++; goto _out; } }} break; case 1058: -#line 1349 "char_ref.rl" {te = p+1;{ output->first = 0x0453; {p++; goto _out; } }} break; case 1059: -#line 1350 "char_ref.rl" {te = p+1;{ output->first = 0x2277; {p++; goto _out; } }} break; case 1060: -#line 1351 "char_ref.rl" {te = p+1;{ output->first = 0x2a92; {p++; goto _out; } }} break; case 1061: -#line 1352 "char_ref.rl" {te = p+1;{ output->first = 0x2aa5; {p++; goto _out; } }} break; case 1062: -#line 1353 "char_ref.rl" {te = p+1;{ output->first = 0x2aa4; {p++; goto _out; } }} break; case 1063: -#line 1354 "char_ref.rl" {te = p+1;{ output->first = 0x2269; {p++; goto _out; } }} break; case 1064: -#line 1355 "char_ref.rl" {te = p+1;{ output->first = 0x2a8a; {p++; goto _out; } }} break; case 1065: -#line 1356 "char_ref.rl" {te = p+1;{ output->first = 0x2a8a; {p++; goto _out; } }} break; case 1066: -#line 1357 "char_ref.rl" {te = p+1;{ output->first = 0x2a88; {p++; goto _out; } }} break; case 1067: -#line 1358 "char_ref.rl" {te = p+1;{ output->first = 0x2a88; {p++; goto _out; } }} break; case 1068: -#line 1359 "char_ref.rl" {te = p+1;{ output->first = 0x2269; {p++; goto _out; } }} break; case 1069: -#line 1360 "char_ref.rl" {te = p+1;{ output->first = 0x22e7; {p++; goto _out; } }} break; case 1070: -#line 1361 "char_ref.rl" {te = p+1;{ output->first = 0x0001d558; {p++; goto _out; } }} break; case 1071: -#line 1362 "char_ref.rl" {te = p+1;{ output->first = 0x60; {p++; goto _out; } }} break; case 1072: -#line 1363 "char_ref.rl" {te = p+1;{ output->first = 0x210a; {p++; goto _out; } }} break; case 1073: -#line 1364 "char_ref.rl" {te = p+1;{ output->first = 0x2273; {p++; goto _out; } }} break; case 1074: -#line 1365 "char_ref.rl" {te = p+1;{ output->first = 0x2a8e; {p++; goto _out; } }} break; case 1075: -#line 1366 "char_ref.rl" {te = p+1;{ 
output->first = 0x2a90; {p++; goto _out; } }} break; case 1076: -#line 1367 "char_ref.rl" {te = p+1;{ output->first = 0x3e; {p++; goto _out; } }} break; case 1077: -#line 1369 "char_ref.rl" {te = p+1;{ output->first = 0x2aa7; {p++; goto _out; } }} break; case 1078: -#line 1370 "char_ref.rl" {te = p+1;{ output->first = 0x2a7a; {p++; goto _out; } }} break; case 1079: -#line 1371 "char_ref.rl" {te = p+1;{ output->first = 0x22d7; {p++; goto _out; } }} break; case 1080: -#line 1372 "char_ref.rl" {te = p+1;{ output->first = 0x2995; {p++; goto _out; } }} break; case 1081: -#line 1373 "char_ref.rl" {te = p+1;{ output->first = 0x2a7c; {p++; goto _out; } }} break; case 1082: -#line 1374 "char_ref.rl" {te = p+1;{ output->first = 0x2a86; {p++; goto _out; } }} break; case 1083: -#line 1375 "char_ref.rl" {te = p+1;{ output->first = 0x2978; {p++; goto _out; } }} break; case 1084: -#line 1376 "char_ref.rl" {te = p+1;{ output->first = 0x22d7; {p++; goto _out; } }} break; case 1085: -#line 1377 "char_ref.rl" {te = p+1;{ output->first = 0x22db; {p++; goto _out; } }} break; case 1086: -#line 1378 "char_ref.rl" {te = p+1;{ output->first = 0x2a8c; {p++; goto _out; } }} break; case 1087: -#line 1379 "char_ref.rl" {te = p+1;{ output->first = 0x2277; {p++; goto _out; } }} break; case 1088: -#line 1380 "char_ref.rl" {te = p+1;{ output->first = 0x2273; {p++; goto _out; } }} break; case 1089: -#line 1381 "char_ref.rl" {te = p+1;{ output->first = 0x2269; output->second = 0xfe00; {p++; goto _out; } }} break; case 1090: -#line 1382 "char_ref.rl" {te = p+1;{ output->first = 0x2269; output->second = 0xfe00; {p++; goto _out; } }} break; case 1091: -#line 1383 "char_ref.rl" {te = p+1;{ output->first = 0x21d4; {p++; goto _out; } }} break; case 1092: -#line 1384 "char_ref.rl" {te = p+1;{ output->first = 0x200a; {p++; goto _out; } }} break; case 1093: -#line 1385 "char_ref.rl" {te = p+1;{ output->first = 0xbd; {p++; goto _out; } }} break; case 1094: -#line 1386 "char_ref.rl" {te = p+1;{ output->first = 
0x210b; {p++; goto _out; } }} break; case 1095: -#line 1387 "char_ref.rl" {te = p+1;{ output->first = 0x044a; {p++; goto _out; } }} break; case 1096: -#line 1388 "char_ref.rl" {te = p+1;{ output->first = 0x2194; {p++; goto _out; } }} break; case 1097: -#line 1389 "char_ref.rl" {te = p+1;{ output->first = 0x2948; {p++; goto _out; } }} break; case 1098: -#line 1390 "char_ref.rl" {te = p+1;{ output->first = 0x21ad; {p++; goto _out; } }} break; case 1099: -#line 1391 "char_ref.rl" {te = p+1;{ output->first = 0x210f; {p++; goto _out; } }} break; case 1100: -#line 1392 "char_ref.rl" {te = p+1;{ output->first = 0x0125; {p++; goto _out; } }} break; case 1101: -#line 1393 "char_ref.rl" {te = p+1;{ output->first = 0x2665; {p++; goto _out; } }} break; case 1102: -#line 1394 "char_ref.rl" {te = p+1;{ output->first = 0x2665; {p++; goto _out; } }} break; case 1103: -#line 1395 "char_ref.rl" {te = p+1;{ output->first = 0x2026; {p++; goto _out; } }} break; case 1104: -#line 1396 "char_ref.rl" {te = p+1;{ output->first = 0x22b9; {p++; goto _out; } }} break; case 1105: -#line 1397 "char_ref.rl" {te = p+1;{ output->first = 0x0001d525; {p++; goto _out; } }} break; case 1106: -#line 1398 "char_ref.rl" {te = p+1;{ output->first = 0x2925; {p++; goto _out; } }} break; case 1107: -#line 1399 "char_ref.rl" {te = p+1;{ output->first = 0x2926; {p++; goto _out; } }} break; case 1108: -#line 1400 "char_ref.rl" {te = p+1;{ output->first = 0x21ff; {p++; goto _out; } }} break; case 1109: -#line 1401 "char_ref.rl" {te = p+1;{ output->first = 0x223b; {p++; goto _out; } }} break; case 1110: -#line 1402 "char_ref.rl" {te = p+1;{ output->first = 0x21a9; {p++; goto _out; } }} break; case 1111: -#line 1403 "char_ref.rl" {te = p+1;{ output->first = 0x21aa; {p++; goto _out; } }} break; case 1112: -#line 1404 "char_ref.rl" {te = p+1;{ output->first = 0x0001d559; {p++; goto _out; } }} break; case 1113: -#line 1405 "char_ref.rl" {te = p+1;{ output->first = 0x2015; {p++; goto _out; } }} break; case 1114: 
-#line 1406 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4bd; {p++; goto _out; } }} break; case 1115: -#line 1407 "char_ref.rl" {te = p+1;{ output->first = 0x210f; {p++; goto _out; } }} break; case 1116: -#line 1408 "char_ref.rl" {te = p+1;{ output->first = 0x0127; {p++; goto _out; } }} break; case 1117: -#line 1409 "char_ref.rl" {te = p+1;{ output->first = 0x2043; {p++; goto _out; } }} break; case 1118: -#line 1410 "char_ref.rl" {te = p+1;{ output->first = 0x2010; {p++; goto _out; } }} break; case 1119: -#line 1411 "char_ref.rl" {te = p+1;{ output->first = 0xed; {p++; goto _out; } }} break; case 1120: -#line 1413 "char_ref.rl" {te = p+1;{ output->first = 0x2063; {p++; goto _out; } }} break; case 1121: -#line 1414 "char_ref.rl" {te = p+1;{ output->first = 0xee; {p++; goto _out; } }} break; case 1122: -#line 1416 "char_ref.rl" {te = p+1;{ output->first = 0x0438; {p++; goto _out; } }} break; case 1123: -#line 1417 "char_ref.rl" {te = p+1;{ output->first = 0x0435; {p++; goto _out; } }} break; case 1124: -#line 1418 "char_ref.rl" {te = p+1;{ output->first = 0xa1; {p++; goto _out; } }} break; case 1125: -#line 1420 "char_ref.rl" {te = p+1;{ output->first = 0x21d4; {p++; goto _out; } }} break; case 1126: -#line 1421 "char_ref.rl" {te = p+1;{ output->first = 0x0001d526; {p++; goto _out; } }} break; case 1127: -#line 1422 "char_ref.rl" {te = p+1;{ output->first = 0xec; {p++; goto _out; } }} break; case 1128: -#line 1424 "char_ref.rl" {te = p+1;{ output->first = 0x2148; {p++; goto _out; } }} break; case 1129: -#line 1425 "char_ref.rl" {te = p+1;{ output->first = 0x2a0c; {p++; goto _out; } }} break; case 1130: -#line 1426 "char_ref.rl" {te = p+1;{ output->first = 0x222d; {p++; goto _out; } }} break; case 1131: -#line 1427 "char_ref.rl" {te = p+1;{ output->first = 0x29dc; {p++; goto _out; } }} break; case 1132: -#line 1428 "char_ref.rl" {te = p+1;{ output->first = 0x2129; {p++; goto _out; } }} break; case 1133: -#line 1429 "char_ref.rl" {te = p+1;{ output->first = 0x0133; 
{p++; goto _out; } }} break; case 1134: -#line 1430 "char_ref.rl" {te = p+1;{ output->first = 0x012b; {p++; goto _out; } }} break; case 1135: -#line 1431 "char_ref.rl" {te = p+1;{ output->first = 0x2111; {p++; goto _out; } }} break; case 1136: -#line 1432 "char_ref.rl" {te = p+1;{ output->first = 0x2110; {p++; goto _out; } }} break; case 1137: -#line 1433 "char_ref.rl" {te = p+1;{ output->first = 0x2111; {p++; goto _out; } }} break; case 1138: -#line 1434 "char_ref.rl" {te = p+1;{ output->first = 0x0131; {p++; goto _out; } }} break; case 1139: -#line 1435 "char_ref.rl" {te = p+1;{ output->first = 0x22b7; {p++; goto _out; } }} break; case 1140: -#line 1436 "char_ref.rl" {te = p+1;{ output->first = 0x01b5; {p++; goto _out; } }} break; case 1141: -#line 1437 "char_ref.rl" {te = p+1;{ output->first = 0x2208; {p++; goto _out; } }} break; case 1142: -#line 1438 "char_ref.rl" {te = p+1;{ output->first = 0x2105; {p++; goto _out; } }} break; case 1143: -#line 1439 "char_ref.rl" {te = p+1;{ output->first = 0x221e; {p++; goto _out; } }} break; case 1144: -#line 1440 "char_ref.rl" {te = p+1;{ output->first = 0x29dd; {p++; goto _out; } }} break; case 1145: -#line 1441 "char_ref.rl" {te = p+1;{ output->first = 0x0131; {p++; goto _out; } }} break; case 1146: -#line 1442 "char_ref.rl" {te = p+1;{ output->first = 0x222b; {p++; goto _out; } }} break; case 1147: -#line 1443 "char_ref.rl" {te = p+1;{ output->first = 0x22ba; {p++; goto _out; } }} break; case 1148: -#line 1444 "char_ref.rl" {te = p+1;{ output->first = 0x2124; {p++; goto _out; } }} break; case 1149: -#line 1445 "char_ref.rl" {te = p+1;{ output->first = 0x22ba; {p++; goto _out; } }} break; case 1150: -#line 1446 "char_ref.rl" {te = p+1;{ output->first = 0x2a17; {p++; goto _out; } }} break; case 1151: -#line 1447 "char_ref.rl" {te = p+1;{ output->first = 0x2a3c; {p++; goto _out; } }} break; case 1152: -#line 1448 "char_ref.rl" {te = p+1;{ output->first = 0x0451; {p++; goto _out; } }} break; case 1153: -#line 1449 
"char_ref.rl" {te = p+1;{ output->first = 0x012f; {p++; goto _out; } }} break; case 1154: -#line 1450 "char_ref.rl" {te = p+1;{ output->first = 0x0001d55a; {p++; goto _out; } }} break; case 1155: -#line 1451 "char_ref.rl" {te = p+1;{ output->first = 0x03b9; {p++; goto _out; } }} break; case 1156: -#line 1452 "char_ref.rl" {te = p+1;{ output->first = 0x2a3c; {p++; goto _out; } }} break; case 1157: -#line 1453 "char_ref.rl" {te = p+1;{ output->first = 0xbf; {p++; goto _out; } }} break; case 1158: -#line 1455 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4be; {p++; goto _out; } }} break; case 1159: -#line 1456 "char_ref.rl" {te = p+1;{ output->first = 0x2208; {p++; goto _out; } }} break; case 1160: -#line 1457 "char_ref.rl" {te = p+1;{ output->first = 0x22f9; {p++; goto _out; } }} break; case 1161: -#line 1458 "char_ref.rl" {te = p+1;{ output->first = 0x22f5; {p++; goto _out; } }} break; case 1162: -#line 1459 "char_ref.rl" {te = p+1;{ output->first = 0x22f4; {p++; goto _out; } }} break; case 1163: -#line 1460 "char_ref.rl" {te = p+1;{ output->first = 0x22f3; {p++; goto _out; } }} break; case 1164: -#line 1461 "char_ref.rl" {te = p+1;{ output->first = 0x2208; {p++; goto _out; } }} break; case 1165: -#line 1462 "char_ref.rl" {te = p+1;{ output->first = 0x2062; {p++; goto _out; } }} break; case 1166: -#line 1463 "char_ref.rl" {te = p+1;{ output->first = 0x0129; {p++; goto _out; } }} break; case 1167: -#line 1464 "char_ref.rl" {te = p+1;{ output->first = 0x0456; {p++; goto _out; } }} break; case 1168: -#line 1465 "char_ref.rl" {te = p+1;{ output->first = 0xef; {p++; goto _out; } }} break; case 1169: -#line 1467 "char_ref.rl" {te = p+1;{ output->first = 0x0135; {p++; goto _out; } }} break; case 1170: -#line 1468 "char_ref.rl" {te = p+1;{ output->first = 0x0439; {p++; goto _out; } }} break; case 1171: -#line 1469 "char_ref.rl" {te = p+1;{ output->first = 0x0001d527; {p++; goto _out; } }} break; case 1172: -#line 1470 "char_ref.rl" {te = p+1;{ output->first = 0x0237; 
{p++; goto _out; } }} break; case 1173: -#line 1471 "char_ref.rl" {te = p+1;{ output->first = 0x0001d55b; {p++; goto _out; } }} break; case 1174: -#line 1472 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4bf; {p++; goto _out; } }} break; case 1175: -#line 1473 "char_ref.rl" {te = p+1;{ output->first = 0x0458; {p++; goto _out; } }} break; case 1176: -#line 1474 "char_ref.rl" {te = p+1;{ output->first = 0x0454; {p++; goto _out; } }} break; case 1177: -#line 1475 "char_ref.rl" {te = p+1;{ output->first = 0x03ba; {p++; goto _out; } }} break; case 1178: -#line 1476 "char_ref.rl" {te = p+1;{ output->first = 0x03f0; {p++; goto _out; } }} break; case 1179: -#line 1477 "char_ref.rl" {te = p+1;{ output->first = 0x0137; {p++; goto _out; } }} break; case 1180: -#line 1478 "char_ref.rl" {te = p+1;{ output->first = 0x043a; {p++; goto _out; } }} break; case 1181: -#line 1479 "char_ref.rl" {te = p+1;{ output->first = 0x0001d528; {p++; goto _out; } }} break; case 1182: -#line 1480 "char_ref.rl" {te = p+1;{ output->first = 0x0138; {p++; goto _out; } }} break; case 1183: -#line 1481 "char_ref.rl" {te = p+1;{ output->first = 0x0445; {p++; goto _out; } }} break; case 1184: -#line 1482 "char_ref.rl" {te = p+1;{ output->first = 0x045c; {p++; goto _out; } }} break; case 1185: -#line 1483 "char_ref.rl" {te = p+1;{ output->first = 0x0001d55c; {p++; goto _out; } }} break; case 1186: -#line 1484 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c0; {p++; goto _out; } }} break; case 1187: -#line 1485 "char_ref.rl" {te = p+1;{ output->first = 0x21da; {p++; goto _out; } }} break; case 1188: -#line 1486 "char_ref.rl" {te = p+1;{ output->first = 0x21d0; {p++; goto _out; } }} break; case 1189: -#line 1487 "char_ref.rl" {te = p+1;{ output->first = 0x291b; {p++; goto _out; } }} break; case 1190: -#line 1488 "char_ref.rl" {te = p+1;{ output->first = 0x290e; {p++; goto _out; } }} break; case 1191: -#line 1489 "char_ref.rl" {te = p+1;{ output->first = 0x2266; {p++; goto _out; } }} break; case 1192: 
-#line 1490 "char_ref.rl" {te = p+1;{ output->first = 0x2a8b; {p++; goto _out; } }} break; case 1193: -#line 1491 "char_ref.rl" {te = p+1;{ output->first = 0x2962; {p++; goto _out; } }} break; case 1194: -#line 1492 "char_ref.rl" {te = p+1;{ output->first = 0x013a; {p++; goto _out; } }} break; case 1195: -#line 1493 "char_ref.rl" {te = p+1;{ output->first = 0x29b4; {p++; goto _out; } }} break; case 1196: -#line 1494 "char_ref.rl" {te = p+1;{ output->first = 0x2112; {p++; goto _out; } }} break; case 1197: -#line 1495 "char_ref.rl" {te = p+1;{ output->first = 0x03bb; {p++; goto _out; } }} break; case 1198: -#line 1496 "char_ref.rl" {te = p+1;{ output->first = 0x27e8; {p++; goto _out; } }} break; case 1199: -#line 1497 "char_ref.rl" {te = p+1;{ output->first = 0x2991; {p++; goto _out; } }} break; case 1200: -#line 1498 "char_ref.rl" {te = p+1;{ output->first = 0x27e8; {p++; goto _out; } }} break; case 1201: -#line 1499 "char_ref.rl" {te = p+1;{ output->first = 0x2a85; {p++; goto _out; } }} break; case 1202: -#line 1500 "char_ref.rl" {te = p+1;{ output->first = 0xab; {p++; goto _out; } }} break; case 1203: -#line 1502 "char_ref.rl" {te = p+1;{ output->first = 0x2190; {p++; goto _out; } }} break; case 1204: -#line 1503 "char_ref.rl" {te = p+1;{ output->first = 0x21e4; {p++; goto _out; } }} break; case 1205: -#line 1504 "char_ref.rl" {te = p+1;{ output->first = 0x291f; {p++; goto _out; } }} break; case 1206: -#line 1505 "char_ref.rl" {te = p+1;{ output->first = 0x291d; {p++; goto _out; } }} break; case 1207: -#line 1506 "char_ref.rl" {te = p+1;{ output->first = 0x21a9; {p++; goto _out; } }} break; case 1208: -#line 1507 "char_ref.rl" {te = p+1;{ output->first = 0x21ab; {p++; goto _out; } }} break; case 1209: -#line 1508 "char_ref.rl" {te = p+1;{ output->first = 0x2939; {p++; goto _out; } }} break; case 1210: -#line 1509 "char_ref.rl" {te = p+1;{ output->first = 0x2973; {p++; goto _out; } }} break; case 1211: -#line 1510 "char_ref.rl" {te = p+1;{ output->first = 0x21a2; 
{p++; goto _out; } }} break; case 1212: -#line 1511 "char_ref.rl" {te = p+1;{ output->first = 0x2aab; {p++; goto _out; } }} break; case 1213: -#line 1512 "char_ref.rl" {te = p+1;{ output->first = 0x2919; {p++; goto _out; } }} break; case 1214: -#line 1513 "char_ref.rl" {te = p+1;{ output->first = 0x2aad; {p++; goto _out; } }} break; case 1215: -#line 1514 "char_ref.rl" {te = p+1;{ output->first = 0x2aad; output->second = 0xfe00; {p++; goto _out; } }} break; case 1216: -#line 1515 "char_ref.rl" {te = p+1;{ output->first = 0x290c; {p++; goto _out; } }} break; case 1217: -#line 1516 "char_ref.rl" {te = p+1;{ output->first = 0x2772; {p++; goto _out; } }} break; case 1218: -#line 1517 "char_ref.rl" {te = p+1;{ output->first = 0x7b; {p++; goto _out; } }} break; case 1219: -#line 1518 "char_ref.rl" {te = p+1;{ output->first = 0x5b; {p++; goto _out; } }} break; case 1220: -#line 1519 "char_ref.rl" {te = p+1;{ output->first = 0x298b; {p++; goto _out; } }} break; case 1221: -#line 1520 "char_ref.rl" {te = p+1;{ output->first = 0x298f; {p++; goto _out; } }} break; case 1222: -#line 1521 "char_ref.rl" {te = p+1;{ output->first = 0x298d; {p++; goto _out; } }} break; case 1223: -#line 1522 "char_ref.rl" {te = p+1;{ output->first = 0x013e; {p++; goto _out; } }} break; case 1224: -#line 1523 "char_ref.rl" {te = p+1;{ output->first = 0x013c; {p++; goto _out; } }} break; case 1225: -#line 1524 "char_ref.rl" {te = p+1;{ output->first = 0x2308; {p++; goto _out; } }} break; case 1226: -#line 1525 "char_ref.rl" {te = p+1;{ output->first = 0x7b; {p++; goto _out; } }} break; case 1227: -#line 1526 "char_ref.rl" {te = p+1;{ output->first = 0x043b; {p++; goto _out; } }} break; case 1228: -#line 1527 "char_ref.rl" {te = p+1;{ output->first = 0x2936; {p++; goto _out; } }} break; case 1229: -#line 1528 "char_ref.rl" {te = p+1;{ output->first = 0x201c; {p++; goto _out; } }} break; case 1230: -#line 1529 "char_ref.rl" {te = p+1;{ output->first = 0x201e; {p++; goto _out; } }} break; case 1231: 
-#line 1530 "char_ref.rl" {te = p+1;{ output->first = 0x2967; {p++; goto _out; } }} break; case 1232: -#line 1531 "char_ref.rl" {te = p+1;{ output->first = 0x294b; {p++; goto _out; } }} break; case 1233: -#line 1532 "char_ref.rl" {te = p+1;{ output->first = 0x21b2; {p++; goto _out; } }} break; case 1234: -#line 1533 "char_ref.rl" {te = p+1;{ output->first = 0x2264; {p++; goto _out; } }} break; case 1235: -#line 1534 "char_ref.rl" {te = p+1;{ output->first = 0x2190; {p++; goto _out; } }} break; case 1236: -#line 1535 "char_ref.rl" {te = p+1;{ output->first = 0x21a2; {p++; goto _out; } }} break; case 1237: -#line 1536 "char_ref.rl" {te = p+1;{ output->first = 0x21bd; {p++; goto _out; } }} break; case 1238: -#line 1537 "char_ref.rl" {te = p+1;{ output->first = 0x21bc; {p++; goto _out; } }} break; case 1239: -#line 1538 "char_ref.rl" {te = p+1;{ output->first = 0x21c7; {p++; goto _out; } }} break; case 1240: -#line 1539 "char_ref.rl" {te = p+1;{ output->first = 0x2194; {p++; goto _out; } }} break; case 1241: -#line 1540 "char_ref.rl" {te = p+1;{ output->first = 0x21c6; {p++; goto _out; } }} break; case 1242: -#line 1541 "char_ref.rl" {te = p+1;{ output->first = 0x21cb; {p++; goto _out; } }} break; case 1243: -#line 1542 "char_ref.rl" {te = p+1;{ output->first = 0x21ad; {p++; goto _out; } }} break; case 1244: -#line 1543 "char_ref.rl" {te = p+1;{ output->first = 0x22cb; {p++; goto _out; } }} break; case 1245: -#line 1544 "char_ref.rl" {te = p+1;{ output->first = 0x22da; {p++; goto _out; } }} break; case 1246: -#line 1545 "char_ref.rl" {te = p+1;{ output->first = 0x2264; {p++; goto _out; } }} break; case 1247: -#line 1546 "char_ref.rl" {te = p+1;{ output->first = 0x2266; {p++; goto _out; } }} break; case 1248: -#line 1547 "char_ref.rl" {te = p+1;{ output->first = 0x2a7d; {p++; goto _out; } }} break; case 1249: -#line 1548 "char_ref.rl" {te = p+1;{ output->first = 0x2a7d; {p++; goto _out; } }} break; case 1250: -#line 1549 "char_ref.rl" {te = p+1;{ output->first = 0x2aa8; 
{p++; goto _out; } }} break; case 1251: -#line 1550 "char_ref.rl" {te = p+1;{ output->first = 0x2a7f; {p++; goto _out; } }} break; case 1252: -#line 1551 "char_ref.rl" {te = p+1;{ output->first = 0x2a81; {p++; goto _out; } }} break; case 1253: -#line 1552 "char_ref.rl" {te = p+1;{ output->first = 0x2a83; {p++; goto _out; } }} break; case 1254: -#line 1553 "char_ref.rl" {te = p+1;{ output->first = 0x22da; output->second = 0xfe00; {p++; goto _out; } }} break; case 1255: -#line 1554 "char_ref.rl" {te = p+1;{ output->first = 0x2a93; {p++; goto _out; } }} break; case 1256: -#line 1555 "char_ref.rl" {te = p+1;{ output->first = 0x2a85; {p++; goto _out; } }} break; case 1257: -#line 1556 "char_ref.rl" {te = p+1;{ output->first = 0x22d6; {p++; goto _out; } }} break; case 1258: -#line 1557 "char_ref.rl" {te = p+1;{ output->first = 0x22da; {p++; goto _out; } }} break; case 1259: -#line 1558 "char_ref.rl" {te = p+1;{ output->first = 0x2a8b; {p++; goto _out; } }} break; case 1260: -#line 1559 "char_ref.rl" {te = p+1;{ output->first = 0x2276; {p++; goto _out; } }} break; case 1261: -#line 1560 "char_ref.rl" {te = p+1;{ output->first = 0x2272; {p++; goto _out; } }} break; case 1262: -#line 1561 "char_ref.rl" {te = p+1;{ output->first = 0x297c; {p++; goto _out; } }} break; case 1263: -#line 1562 "char_ref.rl" {te = p+1;{ output->first = 0x230a; {p++; goto _out; } }} break; case 1264: -#line 1563 "char_ref.rl" {te = p+1;{ output->first = 0x0001d529; {p++; goto _out; } }} break; case 1265: -#line 1564 "char_ref.rl" {te = p+1;{ output->first = 0x2276; {p++; goto _out; } }} break; case 1266: -#line 1565 "char_ref.rl" {te = p+1;{ output->first = 0x2a91; {p++; goto _out; } }} break; case 1267: -#line 1566 "char_ref.rl" {te = p+1;{ output->first = 0x21bd; {p++; goto _out; } }} break; case 1268: -#line 1567 "char_ref.rl" {te = p+1;{ output->first = 0x21bc; {p++; goto _out; } }} break; case 1269: -#line 1568 "char_ref.rl" {te = p+1;{ output->first = 0x296a; {p++; goto _out; } }} break; 
case 1270: -#line 1569 "char_ref.rl" {te = p+1;{ output->first = 0x2584; {p++; goto _out; } }} break; case 1271: -#line 1570 "char_ref.rl" {te = p+1;{ output->first = 0x0459; {p++; goto _out; } }} break; case 1272: -#line 1571 "char_ref.rl" {te = p+1;{ output->first = 0x226a; {p++; goto _out; } }} break; case 1273: -#line 1572 "char_ref.rl" {te = p+1;{ output->first = 0x21c7; {p++; goto _out; } }} break; case 1274: -#line 1573 "char_ref.rl" {te = p+1;{ output->first = 0x231e; {p++; goto _out; } }} break; case 1275: -#line 1574 "char_ref.rl" {te = p+1;{ output->first = 0x296b; {p++; goto _out; } }} break; case 1276: -#line 1575 "char_ref.rl" {te = p+1;{ output->first = 0x25fa; {p++; goto _out; } }} break; case 1277: -#line 1576 "char_ref.rl" {te = p+1;{ output->first = 0x0140; {p++; goto _out; } }} break; case 1278: -#line 1577 "char_ref.rl" {te = p+1;{ output->first = 0x23b0; {p++; goto _out; } }} break; case 1279: -#line 1578 "char_ref.rl" {te = p+1;{ output->first = 0x23b0; {p++; goto _out; } }} break; case 1280: -#line 1579 "char_ref.rl" {te = p+1;{ output->first = 0x2268; {p++; goto _out; } }} break; case 1281: -#line 1580 "char_ref.rl" {te = p+1;{ output->first = 0x2a89; {p++; goto _out; } }} break; case 1282: -#line 1581 "char_ref.rl" {te = p+1;{ output->first = 0x2a89; {p++; goto _out; } }} break; case 1283: -#line 1582 "char_ref.rl" {te = p+1;{ output->first = 0x2a87; {p++; goto _out; } }} break; case 1284: -#line 1583 "char_ref.rl" {te = p+1;{ output->first = 0x2a87; {p++; goto _out; } }} break; case 1285: -#line 1584 "char_ref.rl" {te = p+1;{ output->first = 0x2268; {p++; goto _out; } }} break; case 1286: -#line 1585 "char_ref.rl" {te = p+1;{ output->first = 0x22e6; {p++; goto _out; } }} break; case 1287: -#line 1586 "char_ref.rl" {te = p+1;{ output->first = 0x27ec; {p++; goto _out; } }} break; case 1288: -#line 1587 "char_ref.rl" {te = p+1;{ output->first = 0x21fd; {p++; goto _out; } }} break; case 1289: -#line 1588 "char_ref.rl" {te = p+1;{ 
output->first = 0x27e6; {p++; goto _out; } }} break; case 1290: -#line 1589 "char_ref.rl" {te = p+1;{ output->first = 0x27f5; {p++; goto _out; } }} break; case 1291: -#line 1590 "char_ref.rl" {te = p+1;{ output->first = 0x27f7; {p++; goto _out; } }} break; case 1292: -#line 1591 "char_ref.rl" {te = p+1;{ output->first = 0x27fc; {p++; goto _out; } }} break; case 1293: -#line 1592 "char_ref.rl" {te = p+1;{ output->first = 0x27f6; {p++; goto _out; } }} break; case 1294: -#line 1593 "char_ref.rl" {te = p+1;{ output->first = 0x21ab; {p++; goto _out; } }} break; case 1295: -#line 1594 "char_ref.rl" {te = p+1;{ output->first = 0x21ac; {p++; goto _out; } }} break; case 1296: -#line 1595 "char_ref.rl" {te = p+1;{ output->first = 0x2985; {p++; goto _out; } }} break; case 1297: -#line 1596 "char_ref.rl" {te = p+1;{ output->first = 0x0001d55d; {p++; goto _out; } }} break; case 1298: -#line 1597 "char_ref.rl" {te = p+1;{ output->first = 0x2a2d; {p++; goto _out; } }} break; case 1299: -#line 1598 "char_ref.rl" {te = p+1;{ output->first = 0x2a34; {p++; goto _out; } }} break; case 1300: -#line 1599 "char_ref.rl" {te = p+1;{ output->first = 0x2217; {p++; goto _out; } }} break; case 1301: -#line 1600 "char_ref.rl" {te = p+1;{ output->first = 0x5f; {p++; goto _out; } }} break; case 1302: -#line 1601 "char_ref.rl" {te = p+1;{ output->first = 0x25ca; {p++; goto _out; } }} break; case 1303: -#line 1602 "char_ref.rl" {te = p+1;{ output->first = 0x25ca; {p++; goto _out; } }} break; case 1304: -#line 1603 "char_ref.rl" {te = p+1;{ output->first = 0x29eb; {p++; goto _out; } }} break; case 1305: -#line 1604 "char_ref.rl" {te = p+1;{ output->first = 0x28; {p++; goto _out; } }} break; case 1306: -#line 1605 "char_ref.rl" {te = p+1;{ output->first = 0x2993; {p++; goto _out; } }} break; case 1307: -#line 1606 "char_ref.rl" {te = p+1;{ output->first = 0x21c6; {p++; goto _out; } }} break; case 1308: -#line 1607 "char_ref.rl" {te = p+1;{ output->first = 0x231f; {p++; goto _out; } }} break; case 
1309: -#line 1608 "char_ref.rl" {te = p+1;{ output->first = 0x21cb; {p++; goto _out; } }} break; case 1310: -#line 1609 "char_ref.rl" {te = p+1;{ output->first = 0x296d; {p++; goto _out; } }} break; case 1311: -#line 1610 "char_ref.rl" {te = p+1;{ output->first = 0x200e; {p++; goto _out; } }} break; case 1312: -#line 1611 "char_ref.rl" {te = p+1;{ output->first = 0x22bf; {p++; goto _out; } }} break; case 1313: -#line 1612 "char_ref.rl" {te = p+1;{ output->first = 0x2039; {p++; goto _out; } }} break; case 1314: -#line 1613 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c1; {p++; goto _out; } }} break; case 1315: -#line 1614 "char_ref.rl" {te = p+1;{ output->first = 0x21b0; {p++; goto _out; } }} break; case 1316: -#line 1615 "char_ref.rl" {te = p+1;{ output->first = 0x2272; {p++; goto _out; } }} break; case 1317: -#line 1616 "char_ref.rl" {te = p+1;{ output->first = 0x2a8d; {p++; goto _out; } }} break; case 1318: -#line 1617 "char_ref.rl" {te = p+1;{ output->first = 0x2a8f; {p++; goto _out; } }} break; case 1319: -#line 1618 "char_ref.rl" {te = p+1;{ output->first = 0x5b; {p++; goto _out; } }} break; case 1320: -#line 1619 "char_ref.rl" {te = p+1;{ output->first = 0x2018; {p++; goto _out; } }} break; case 1321: -#line 1620 "char_ref.rl" {te = p+1;{ output->first = 0x201a; {p++; goto _out; } }} break; case 1322: -#line 1621 "char_ref.rl" {te = p+1;{ output->first = 0x0142; {p++; goto _out; } }} break; case 1323: -#line 1622 "char_ref.rl" {te = p+1;{ output->first = 0x3c; {p++; goto _out; } }} break; case 1324: -#line 1624 "char_ref.rl" {te = p+1;{ output->first = 0x2aa6; {p++; goto _out; } }} break; case 1325: -#line 1625 "char_ref.rl" {te = p+1;{ output->first = 0x2a79; {p++; goto _out; } }} break; case 1326: -#line 1626 "char_ref.rl" {te = p+1;{ output->first = 0x22d6; {p++; goto _out; } }} break; case 1327: -#line 1627 "char_ref.rl" {te = p+1;{ output->first = 0x22cb; {p++; goto _out; } }} break; case 1328: -#line 1628 "char_ref.rl" {te = p+1;{ output->first = 
0x22c9; {p++; goto _out; } }} break; case 1329: -#line 1629 "char_ref.rl" {te = p+1;{ output->first = 0x2976; {p++; goto _out; } }} break; case 1330: -#line 1630 "char_ref.rl" {te = p+1;{ output->first = 0x2a7b; {p++; goto _out; } }} break; case 1331: -#line 1631 "char_ref.rl" {te = p+1;{ output->first = 0x2996; {p++; goto _out; } }} break; case 1332: -#line 1632 "char_ref.rl" {te = p+1;{ output->first = 0x25c3; {p++; goto _out; } }} break; case 1333: -#line 1633 "char_ref.rl" {te = p+1;{ output->first = 0x22b4; {p++; goto _out; } }} break; case 1334: -#line 1634 "char_ref.rl" {te = p+1;{ output->first = 0x25c2; {p++; goto _out; } }} break; case 1335: -#line 1635 "char_ref.rl" {te = p+1;{ output->first = 0x294a; {p++; goto _out; } }} break; case 1336: -#line 1636 "char_ref.rl" {te = p+1;{ output->first = 0x2966; {p++; goto _out; } }} break; case 1337: -#line 1637 "char_ref.rl" {te = p+1;{ output->first = 0x2268; output->second = 0xfe00; {p++; goto _out; } }} break; case 1338: -#line 1638 "char_ref.rl" {te = p+1;{ output->first = 0x2268; output->second = 0xfe00; {p++; goto _out; } }} break; case 1339: -#line 1639 "char_ref.rl" {te = p+1;{ output->first = 0x223a; {p++; goto _out; } }} break; case 1340: -#line 1640 "char_ref.rl" {te = p+1;{ output->first = 0xaf; {p++; goto _out; } }} break; case 1341: -#line 1642 "char_ref.rl" {te = p+1;{ output->first = 0x2642; {p++; goto _out; } }} break; case 1342: -#line 1643 "char_ref.rl" {te = p+1;{ output->first = 0x2720; {p++; goto _out; } }} break; case 1343: -#line 1644 "char_ref.rl" {te = p+1;{ output->first = 0x2720; {p++; goto _out; } }} break; case 1344: -#line 1645 "char_ref.rl" {te = p+1;{ output->first = 0x21a6; {p++; goto _out; } }} break; case 1345: -#line 1646 "char_ref.rl" {te = p+1;{ output->first = 0x21a6; {p++; goto _out; } }} break; case 1346: -#line 1647 "char_ref.rl" {te = p+1;{ output->first = 0x21a7; {p++; goto _out; } }} break; case 1347: -#line 1648 "char_ref.rl" {te = p+1;{ output->first = 0x21a4; {p++; 
goto _out; } }} break; case 1348: -#line 1649 "char_ref.rl" {te = p+1;{ output->first = 0x21a5; {p++; goto _out; } }} break; case 1349: -#line 1650 "char_ref.rl" {te = p+1;{ output->first = 0x25ae; {p++; goto _out; } }} break; case 1350: -#line 1651 "char_ref.rl" {te = p+1;{ output->first = 0x2a29; {p++; goto _out; } }} break; case 1351: -#line 1652 "char_ref.rl" {te = p+1;{ output->first = 0x043c; {p++; goto _out; } }} break; case 1352: -#line 1653 "char_ref.rl" {te = p+1;{ output->first = 0x2014; {p++; goto _out; } }} break; case 1353: -#line 1654 "char_ref.rl" {te = p+1;{ output->first = 0x2221; {p++; goto _out; } }} break; case 1354: -#line 1655 "char_ref.rl" {te = p+1;{ output->first = 0x0001d52a; {p++; goto _out; } }} break; case 1355: -#line 1656 "char_ref.rl" {te = p+1;{ output->first = 0x2127; {p++; goto _out; } }} break; case 1356: -#line 1657 "char_ref.rl" {te = p+1;{ output->first = 0xb5; {p++; goto _out; } }} break; case 1357: -#line 1659 "char_ref.rl" {te = p+1;{ output->first = 0x2223; {p++; goto _out; } }} break; case 1358: -#line 1660 "char_ref.rl" {te = p+1;{ output->first = 0x2a; {p++; goto _out; } }} break; case 1359: -#line 1661 "char_ref.rl" {te = p+1;{ output->first = 0x2af0; {p++; goto _out; } }} break; case 1360: -#line 1662 "char_ref.rl" {te = p+1;{ output->first = 0xb7; {p++; goto _out; } }} break; case 1361: -#line 1664 "char_ref.rl" {te = p+1;{ output->first = 0x2212; {p++; goto _out; } }} break; case 1362: -#line 1665 "char_ref.rl" {te = p+1;{ output->first = 0x229f; {p++; goto _out; } }} break; case 1363: -#line 1666 "char_ref.rl" {te = p+1;{ output->first = 0x2238; {p++; goto _out; } }} break; case 1364: -#line 1667 "char_ref.rl" {te = p+1;{ output->first = 0x2a2a; {p++; goto _out; } }} break; case 1365: -#line 1668 "char_ref.rl" {te = p+1;{ output->first = 0x2adb; {p++; goto _out; } }} break; case 1366: -#line 1669 "char_ref.rl" {te = p+1;{ output->first = 0x2026; {p++; goto _out; } }} break; case 1367: -#line 1670 "char_ref.rl" {te 
= p+1;{ output->first = 0x2213; {p++; goto _out; } }} break; case 1368: -#line 1671 "char_ref.rl" {te = p+1;{ output->first = 0x22a7; {p++; goto _out; } }} break; case 1369: -#line 1672 "char_ref.rl" {te = p+1;{ output->first = 0x0001d55e; {p++; goto _out; } }} break; case 1370: -#line 1673 "char_ref.rl" {te = p+1;{ output->first = 0x2213; {p++; goto _out; } }} break; case 1371: -#line 1674 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c2; {p++; goto _out; } }} break; case 1372: -#line 1675 "char_ref.rl" {te = p+1;{ output->first = 0x223e; {p++; goto _out; } }} break; case 1373: -#line 1676 "char_ref.rl" {te = p+1;{ output->first = 0x03bc; {p++; goto _out; } }} break; case 1374: -#line 1677 "char_ref.rl" {te = p+1;{ output->first = 0x22b8; {p++; goto _out; } }} break; case 1375: -#line 1678 "char_ref.rl" {te = p+1;{ output->first = 0x22b8; {p++; goto _out; } }} break; case 1376: -#line 1679 "char_ref.rl" {te = p+1;{ output->first = 0x22d9; output->second = 0x0338; {p++; goto _out; } }} break; case 1377: -#line 1680 "char_ref.rl" {te = p+1;{ output->first = 0x226b; output->second = 0x20d2; {p++; goto _out; } }} break; case 1378: -#line 1681 "char_ref.rl" {te = p+1;{ output->first = 0x226b; output->second = 0x0338; {p++; goto _out; } }} break; case 1379: -#line 1682 "char_ref.rl" {te = p+1;{ output->first = 0x21cd; {p++; goto _out; } }} break; case 1380: -#line 1683 "char_ref.rl" {te = p+1;{ output->first = 0x21ce; {p++; goto _out; } }} break; case 1381: -#line 1684 "char_ref.rl" {te = p+1;{ output->first = 0x22d8; output->second = 0x0338; {p++; goto _out; } }} break; case 1382: -#line 1685 "char_ref.rl" {te = p+1;{ output->first = 0x226a; output->second = 0x20d2; {p++; goto _out; } }} break; case 1383: -#line 1686 "char_ref.rl" {te = p+1;{ output->first = 0x226a; output->second = 0x0338; {p++; goto _out; } }} break; case 1384: -#line 1687 "char_ref.rl" {te = p+1;{ output->first = 0x21cf; {p++; goto _out; } }} break; case 1385: -#line 1688 "char_ref.rl" {te = 
p+1;{ output->first = 0x22af; {p++; goto _out; } }} break; case 1386: -#line 1689 "char_ref.rl" {te = p+1;{ output->first = 0x22ae; {p++; goto _out; } }} break; case 1387: -#line 1690 "char_ref.rl" {te = p+1;{ output->first = 0x2207; {p++; goto _out; } }} break; case 1388: -#line 1691 "char_ref.rl" {te = p+1;{ output->first = 0x0144; {p++; goto _out; } }} break; case 1389: -#line 1692 "char_ref.rl" {te = p+1;{ output->first = 0x2220; output->second = 0x20d2; {p++; goto _out; } }} break; case 1390: -#line 1693 "char_ref.rl" {te = p+1;{ output->first = 0x2249; {p++; goto _out; } }} break; case 1391: -#line 1694 "char_ref.rl" {te = p+1;{ output->first = 0x2a70; output->second = 0x0338; {p++; goto _out; } }} break; case 1392: -#line 1695 "char_ref.rl" {te = p+1;{ output->first = 0x224b; output->second = 0x0338; {p++; goto _out; } }} break; case 1393: -#line 1696 "char_ref.rl" {te = p+1;{ output->first = 0x0149; {p++; goto _out; } }} break; case 1394: -#line 1697 "char_ref.rl" {te = p+1;{ output->first = 0x2249; {p++; goto _out; } }} break; case 1395: -#line 1698 "char_ref.rl" {te = p+1;{ output->first = 0x266e; {p++; goto _out; } }} break; case 1396: -#line 1699 "char_ref.rl" {te = p+1;{ output->first = 0x266e; {p++; goto _out; } }} break; case 1397: -#line 1700 "char_ref.rl" {te = p+1;{ output->first = 0x2115; {p++; goto _out; } }} break; case 1398: -#line 1701 "char_ref.rl" {te = p+1;{ output->first = 0xa0; {p++; goto _out; } }} break; case 1399: -#line 1703 "char_ref.rl" {te = p+1;{ output->first = 0x224e; output->second = 0x0338; {p++; goto _out; } }} break; case 1400: -#line 1704 "char_ref.rl" {te = p+1;{ output->first = 0x224f; output->second = 0x0338; {p++; goto _out; } }} break; case 1401: -#line 1705 "char_ref.rl" {te = p+1;{ output->first = 0x2a43; {p++; goto _out; } }} break; case 1402: -#line 1706 "char_ref.rl" {te = p+1;{ output->first = 0x0148; {p++; goto _out; } }} break; case 1403: -#line 1707 "char_ref.rl" {te = p+1;{ output->first = 0x0146; {p++; goto 
_out; } }} break; case 1404: -#line 1708 "char_ref.rl" {te = p+1;{ output->first = 0x2247; {p++; goto _out; } }} break; case 1405: -#line 1709 "char_ref.rl" {te = p+1;{ output->first = 0x2a6d; output->second = 0x0338; {p++; goto _out; } }} break; case 1406: -#line 1710 "char_ref.rl" {te = p+1;{ output->first = 0x2a42; {p++; goto _out; } }} break; case 1407: -#line 1711 "char_ref.rl" {te = p+1;{ output->first = 0x043d; {p++; goto _out; } }} break; case 1408: -#line 1712 "char_ref.rl" {te = p+1;{ output->first = 0x2013; {p++; goto _out; } }} break; case 1409: -#line 1713 "char_ref.rl" {te = p+1;{ output->first = 0x2260; {p++; goto _out; } }} break; case 1410: -#line 1714 "char_ref.rl" {te = p+1;{ output->first = 0x21d7; {p++; goto _out; } }} break; case 1411: -#line 1715 "char_ref.rl" {te = p+1;{ output->first = 0x2924; {p++; goto _out; } }} break; case 1412: -#line 1716 "char_ref.rl" {te = p+1;{ output->first = 0x2197; {p++; goto _out; } }} break; case 1413: -#line 1717 "char_ref.rl" {te = p+1;{ output->first = 0x2197; {p++; goto _out; } }} break; case 1414: -#line 1718 "char_ref.rl" {te = p+1;{ output->first = 0x2250; output->second = 0x0338; {p++; goto _out; } }} break; case 1415: -#line 1719 "char_ref.rl" {te = p+1;{ output->first = 0x2262; {p++; goto _out; } }} break; case 1416: -#line 1720 "char_ref.rl" {te = p+1;{ output->first = 0x2928; {p++; goto _out; } }} break; case 1417: -#line 1721 "char_ref.rl" {te = p+1;{ output->first = 0x2242; output->second = 0x0338; {p++; goto _out; } }} break; case 1418: -#line 1722 "char_ref.rl" {te = p+1;{ output->first = 0x2204; {p++; goto _out; } }} break; case 1419: -#line 1723 "char_ref.rl" {te = p+1;{ output->first = 0x2204; {p++; goto _out; } }} break; case 1420: -#line 1724 "char_ref.rl" {te = p+1;{ output->first = 0x0001d52b; {p++; goto _out; } }} break; case 1421: -#line 1725 "char_ref.rl" {te = p+1;{ output->first = 0x2267; output->second = 0x0338; {p++; goto _out; } }} break; case 1422: -#line 1726 "char_ref.rl" {te 
= p+1;{ output->first = 0x2271; {p++; goto _out; } }} break; case 1423: -#line 1727 "char_ref.rl" {te = p+1;{ output->first = 0x2271; {p++; goto _out; } }} break; case 1424: -#line 1728 "char_ref.rl" {te = p+1;{ output->first = 0x2267; output->second = 0x0338; {p++; goto _out; } }} break; case 1425: -#line 1729 "char_ref.rl" {te = p+1;{ output->first = 0x2a7e; output->second = 0x0338; {p++; goto _out; } }} break; case 1426: -#line 1730 "char_ref.rl" {te = p+1;{ output->first = 0x2a7e; output->second = 0x0338; {p++; goto _out; } }} break; case 1427: -#line 1731 "char_ref.rl" {te = p+1;{ output->first = 0x2275; {p++; goto _out; } }} break; case 1428: -#line 1732 "char_ref.rl" {te = p+1;{ output->first = 0x226f; {p++; goto _out; } }} break; case 1429: -#line 1733 "char_ref.rl" {te = p+1;{ output->first = 0x226f; {p++; goto _out; } }} break; case 1430: -#line 1734 "char_ref.rl" {te = p+1;{ output->first = 0x21ce; {p++; goto _out; } }} break; case 1431: -#line 1735 "char_ref.rl" {te = p+1;{ output->first = 0x21ae; {p++; goto _out; } }} break; case 1432: -#line 1736 "char_ref.rl" {te = p+1;{ output->first = 0x2af2; {p++; goto _out; } }} break; case 1433: -#line 1737 "char_ref.rl" {te = p+1;{ output->first = 0x220b; {p++; goto _out; } }} break; case 1434: -#line 1738 "char_ref.rl" {te = p+1;{ output->first = 0x22fc; {p++; goto _out; } }} break; case 1435: -#line 1739 "char_ref.rl" {te = p+1;{ output->first = 0x22fa; {p++; goto _out; } }} break; case 1436: -#line 1740 "char_ref.rl" {te = p+1;{ output->first = 0x220b; {p++; goto _out; } }} break; case 1437: -#line 1741 "char_ref.rl" {te = p+1;{ output->first = 0x045a; {p++; goto _out; } }} break; case 1438: -#line 1742 "char_ref.rl" {te = p+1;{ output->first = 0x21cd; {p++; goto _out; } }} break; case 1439: -#line 1743 "char_ref.rl" {te = p+1;{ output->first = 0x2266; output->second = 0x0338; {p++; goto _out; } }} break; case 1440: -#line 1744 "char_ref.rl" {te = p+1;{ output->first = 0x219a; {p++; goto _out; } }} break; 
case 1441: -#line 1745 "char_ref.rl" {te = p+1;{ output->first = 0x2025; {p++; goto _out; } }} break; case 1442: -#line 1746 "char_ref.rl" {te = p+1;{ output->first = 0x2270; {p++; goto _out; } }} break; case 1443: -#line 1747 "char_ref.rl" {te = p+1;{ output->first = 0x219a; {p++; goto _out; } }} break; case 1444: -#line 1748 "char_ref.rl" {te = p+1;{ output->first = 0x21ae; {p++; goto _out; } }} break; case 1445: -#line 1749 "char_ref.rl" {te = p+1;{ output->first = 0x2270; {p++; goto _out; } }} break; case 1446: -#line 1750 "char_ref.rl" {te = p+1;{ output->first = 0x2266; output->second = 0x0338; {p++; goto _out; } }} break; case 1447: -#line 1751 "char_ref.rl" {te = p+1;{ output->first = 0x2a7d; output->second = 0x0338; {p++; goto _out; } }} break; case 1448: -#line 1752 "char_ref.rl" {te = p+1;{ output->first = 0x2a7d; output->second = 0x0338; {p++; goto _out; } }} break; case 1449: -#line 1753 "char_ref.rl" {te = p+1;{ output->first = 0x226e; {p++; goto _out; } }} break; case 1450: -#line 1754 "char_ref.rl" {te = p+1;{ output->first = 0x2274; {p++; goto _out; } }} break; case 1451: -#line 1755 "char_ref.rl" {te = p+1;{ output->first = 0x226e; {p++; goto _out; } }} break; case 1452: -#line 1756 "char_ref.rl" {te = p+1;{ output->first = 0x22ea; {p++; goto _out; } }} break; case 1453: -#line 1757 "char_ref.rl" {te = p+1;{ output->first = 0x22ec; {p++; goto _out; } }} break; case 1454: -#line 1758 "char_ref.rl" {te = p+1;{ output->first = 0x2224; {p++; goto _out; } }} break; case 1455: -#line 1759 "char_ref.rl" {te = p+1;{ output->first = 0x0001d55f; {p++; goto _out; } }} break; case 1456: -#line 1760 "char_ref.rl" {te = p+1;{ output->first = 0xac; {p++; goto _out; } }} break; case 1457: -#line 1761 "char_ref.rl" {te = p+1;{ output->first = 0x2209; {p++; goto _out; } }} break; case 1458: -#line 1762 "char_ref.rl" {te = p+1;{ output->first = 0x22f9; output->second = 0x0338; {p++; goto _out; } }} break; case 1459: -#line 1763 "char_ref.rl" {te = p+1;{ 
output->first = 0x22f5; output->second = 0x0338; {p++; goto _out; } }} break; case 1460: -#line 1764 "char_ref.rl" {te = p+1;{ output->first = 0x2209; {p++; goto _out; } }} break; case 1461: -#line 1765 "char_ref.rl" {te = p+1;{ output->first = 0x22f7; {p++; goto _out; } }} break; case 1462: -#line 1766 "char_ref.rl" {te = p+1;{ output->first = 0x22f6; {p++; goto _out; } }} break; case 1463: -#line 1767 "char_ref.rl" {te = p+1;{ output->first = 0x220c; {p++; goto _out; } }} break; case 1464: -#line 1768 "char_ref.rl" {te = p+1;{ output->first = 0x220c; {p++; goto _out; } }} break; case 1465: -#line 1769 "char_ref.rl" {te = p+1;{ output->first = 0x22fe; {p++; goto _out; } }} break; case 1466: -#line 1770 "char_ref.rl" {te = p+1;{ output->first = 0x22fd; {p++; goto _out; } }} break; case 1467: -#line 1772 "char_ref.rl" {te = p+1;{ output->first = 0x2226; {p++; goto _out; } }} break; case 1468: -#line 1773 "char_ref.rl" {te = p+1;{ output->first = 0x2226; {p++; goto _out; } }} break; case 1469: -#line 1774 "char_ref.rl" {te = p+1;{ output->first = 0x2afd; output->second = 0x20e5; {p++; goto _out; } }} break; case 1470: -#line 1775 "char_ref.rl" {te = p+1;{ output->first = 0x2202; output->second = 0x0338; {p++; goto _out; } }} break; case 1471: -#line 1776 "char_ref.rl" {te = p+1;{ output->first = 0x2a14; {p++; goto _out; } }} break; case 1472: -#line 1777 "char_ref.rl" {te = p+1;{ output->first = 0x2280; {p++; goto _out; } }} break; case 1473: -#line 1778 "char_ref.rl" {te = p+1;{ output->first = 0x22e0; {p++; goto _out; } }} break; case 1474: -#line 1779 "char_ref.rl" {te = p+1;{ output->first = 0x2aaf; output->second = 0x0338; {p++; goto _out; } }} break; case 1475: -#line 1780 "char_ref.rl" {te = p+1;{ output->first = 0x2280; {p++; goto _out; } }} break; case 1476: -#line 1781 "char_ref.rl" {te = p+1;{ output->first = 0x2aaf; output->second = 0x0338; {p++; goto _out; } }} break; case 1477: -#line 1782 "char_ref.rl" {te = p+1;{ output->first = 0x21cf; {p++; goto 
_out; } }} break; case 1478: -#line 1783 "char_ref.rl" {te = p+1;{ output->first = 0x219b; {p++; goto _out; } }} break; case 1479: -#line 1784 "char_ref.rl" {te = p+1;{ output->first = 0x2933; output->second = 0x0338; {p++; goto _out; } }} break; case 1480: -#line 1785 "char_ref.rl" {te = p+1;{ output->first = 0x219d; output->second = 0x0338; {p++; goto _out; } }} break; case 1481: -#line 1786 "char_ref.rl" {te = p+1;{ output->first = 0x219b; {p++; goto _out; } }} break; case 1482: -#line 1787 "char_ref.rl" {te = p+1;{ output->first = 0x22eb; {p++; goto _out; } }} break; case 1483: -#line 1788 "char_ref.rl" {te = p+1;{ output->first = 0x22ed; {p++; goto _out; } }} break; case 1484: -#line 1789 "char_ref.rl" {te = p+1;{ output->first = 0x2281; {p++; goto _out; } }} break; case 1485: -#line 1790 "char_ref.rl" {te = p+1;{ output->first = 0x22e1; {p++; goto _out; } }} break; case 1486: -#line 1791 "char_ref.rl" {te = p+1;{ output->first = 0x2ab0; output->second = 0x0338; {p++; goto _out; } }} break; case 1487: -#line 1792 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c3; {p++; goto _out; } }} break; case 1488: -#line 1793 "char_ref.rl" {te = p+1;{ output->first = 0x2224; {p++; goto _out; } }} break; case 1489: -#line 1794 "char_ref.rl" {te = p+1;{ output->first = 0x2226; {p++; goto _out; } }} break; case 1490: -#line 1795 "char_ref.rl" {te = p+1;{ output->first = 0x2241; {p++; goto _out; } }} break; case 1491: -#line 1796 "char_ref.rl" {te = p+1;{ output->first = 0x2244; {p++; goto _out; } }} break; case 1492: -#line 1797 "char_ref.rl" {te = p+1;{ output->first = 0x2244; {p++; goto _out; } }} break; case 1493: -#line 1798 "char_ref.rl" {te = p+1;{ output->first = 0x2224; {p++; goto _out; } }} break; case 1494: -#line 1799 "char_ref.rl" {te = p+1;{ output->first = 0x2226; {p++; goto _out; } }} break; case 1495: -#line 1800 "char_ref.rl" {te = p+1;{ output->first = 0x22e2; {p++; goto _out; } }} break; case 1496: -#line 1801 "char_ref.rl" {te = p+1;{ output->first = 
0x22e3; {p++; goto _out; } }} break; case 1497: -#line 1802 "char_ref.rl" {te = p+1;{ output->first = 0x2284; {p++; goto _out; } }} break; case 1498: -#line 1803 "char_ref.rl" {te = p+1;{ output->first = 0x2ac5; output->second = 0x0338; {p++; goto _out; } }} break; case 1499: -#line 1804 "char_ref.rl" {te = p+1;{ output->first = 0x2288; {p++; goto _out; } }} break; case 1500: -#line 1805 "char_ref.rl" {te = p+1;{ output->first = 0x2282; output->second = 0x20d2; {p++; goto _out; } }} break; case 1501: -#line 1806 "char_ref.rl" {te = p+1;{ output->first = 0x2288; {p++; goto _out; } }} break; case 1502: -#line 1807 "char_ref.rl" {te = p+1;{ output->first = 0x2ac5; output->second = 0x0338; {p++; goto _out; } }} break; case 1503: -#line 1808 "char_ref.rl" {te = p+1;{ output->first = 0x2281; {p++; goto _out; } }} break; case 1504: -#line 1809 "char_ref.rl" {te = p+1;{ output->first = 0x2ab0; output->second = 0x0338; {p++; goto _out; } }} break; case 1505: -#line 1810 "char_ref.rl" {te = p+1;{ output->first = 0x2285; {p++; goto _out; } }} break; case 1506: -#line 1811 "char_ref.rl" {te = p+1;{ output->first = 0x2ac6; output->second = 0x0338; {p++; goto _out; } }} break; case 1507: -#line 1812 "char_ref.rl" {te = p+1;{ output->first = 0x2289; {p++; goto _out; } }} break; case 1508: -#line 1813 "char_ref.rl" {te = p+1;{ output->first = 0x2283; output->second = 0x20d2; {p++; goto _out; } }} break; case 1509: -#line 1814 "char_ref.rl" {te = p+1;{ output->first = 0x2289; {p++; goto _out; } }} break; case 1510: -#line 1815 "char_ref.rl" {te = p+1;{ output->first = 0x2ac6; output->second = 0x0338; {p++; goto _out; } }} break; case 1511: -#line 1816 "char_ref.rl" {te = p+1;{ output->first = 0x2279; {p++; goto _out; } }} break; case 1512: -#line 1817 "char_ref.rl" {te = p+1;{ output->first = 0xf1; {p++; goto _out; } }} break; case 1513: -#line 1819 "char_ref.rl" {te = p+1;{ output->first = 0x2278; {p++; goto _out; } }} break; case 1514: -#line 1820 "char_ref.rl" {te = p+1;{ 
output->first = 0x22ea; {p++; goto _out; } }} break; case 1515: -#line 1821 "char_ref.rl" {te = p+1;{ output->first = 0x22ec; {p++; goto _out; } }} break; case 1516: -#line 1822 "char_ref.rl" {te = p+1;{ output->first = 0x22eb; {p++; goto _out; } }} break; case 1517: -#line 1823 "char_ref.rl" {te = p+1;{ output->first = 0x22ed; {p++; goto _out; } }} break; case 1518: -#line 1824 "char_ref.rl" {te = p+1;{ output->first = 0x03bd; {p++; goto _out; } }} break; case 1519: -#line 1825 "char_ref.rl" {te = p+1;{ output->first = 0x23; {p++; goto _out; } }} break; case 1520: -#line 1826 "char_ref.rl" {te = p+1;{ output->first = 0x2116; {p++; goto _out; } }} break; case 1521: -#line 1827 "char_ref.rl" {te = p+1;{ output->first = 0x2007; {p++; goto _out; } }} break; case 1522: -#line 1828 "char_ref.rl" {te = p+1;{ output->first = 0x22ad; {p++; goto _out; } }} break; case 1523: -#line 1829 "char_ref.rl" {te = p+1;{ output->first = 0x2904; {p++; goto _out; } }} break; case 1524: -#line 1830 "char_ref.rl" {te = p+1;{ output->first = 0x224d; output->second = 0x20d2; {p++; goto _out; } }} break; case 1525: -#line 1831 "char_ref.rl" {te = p+1;{ output->first = 0x22ac; {p++; goto _out; } }} break; case 1526: -#line 1832 "char_ref.rl" {te = p+1;{ output->first = 0x2265; output->second = 0x20d2; {p++; goto _out; } }} break; case 1527: -#line 1833 "char_ref.rl" {te = p+1;{ output->first = 0x3e; output->second = 0x20d2; {p++; goto _out; } }} break; case 1528: -#line 1834 "char_ref.rl" {te = p+1;{ output->first = 0x29de; {p++; goto _out; } }} break; case 1529: -#line 1835 "char_ref.rl" {te = p+1;{ output->first = 0x2902; {p++; goto _out; } }} break; case 1530: -#line 1836 "char_ref.rl" {te = p+1;{ output->first = 0x2264; output->second = 0x20d2; {p++; goto _out; } }} break; case 1531: -#line 1837 "char_ref.rl" {te = p+1;{ output->first = 0x3c; output->second = 0x20d2; {p++; goto _out; } }} break; case 1532: -#line 1838 "char_ref.rl" {te = p+1;{ output->first = 0x22b4; output->second = 
0x20d2; {p++; goto _out; } }} break; case 1533: -#line 1839 "char_ref.rl" {te = p+1;{ output->first = 0x2903; {p++; goto _out; } }} break; case 1534: -#line 1840 "char_ref.rl" {te = p+1;{ output->first = 0x22b5; output->second = 0x20d2; {p++; goto _out; } }} break; case 1535: -#line 1841 "char_ref.rl" {te = p+1;{ output->first = 0x223c; output->second = 0x20d2; {p++; goto _out; } }} break; case 1536: -#line 1842 "char_ref.rl" {te = p+1;{ output->first = 0x21d6; {p++; goto _out; } }} break; case 1537: -#line 1843 "char_ref.rl" {te = p+1;{ output->first = 0x2923; {p++; goto _out; } }} break; case 1538: -#line 1844 "char_ref.rl" {te = p+1;{ output->first = 0x2196; {p++; goto _out; } }} break; case 1539: -#line 1845 "char_ref.rl" {te = p+1;{ output->first = 0x2196; {p++; goto _out; } }} break; case 1540: -#line 1846 "char_ref.rl" {te = p+1;{ output->first = 0x2927; {p++; goto _out; } }} break; case 1541: -#line 1847 "char_ref.rl" {te = p+1;{ output->first = 0x24c8; {p++; goto _out; } }} break; case 1542: -#line 1848 "char_ref.rl" {te = p+1;{ output->first = 0xf3; {p++; goto _out; } }} break; case 1543: -#line 1850 "char_ref.rl" {te = p+1;{ output->first = 0x229b; {p++; goto _out; } }} break; case 1544: -#line 1851 "char_ref.rl" {te = p+1;{ output->first = 0x229a; {p++; goto _out; } }} break; case 1545: -#line 1852 "char_ref.rl" {te = p+1;{ output->first = 0xf4; {p++; goto _out; } }} break; case 1546: -#line 1854 "char_ref.rl" {te = p+1;{ output->first = 0x043e; {p++; goto _out; } }} break; case 1547: -#line 1855 "char_ref.rl" {te = p+1;{ output->first = 0x229d; {p++; goto _out; } }} break; case 1548: -#line 1856 "char_ref.rl" {te = p+1;{ output->first = 0x0151; {p++; goto _out; } }} break; case 1549: -#line 1857 "char_ref.rl" {te = p+1;{ output->first = 0x2a38; {p++; goto _out; } }} break; case 1550: -#line 1858 "char_ref.rl" {te = p+1;{ output->first = 0x2299; {p++; goto _out; } }} break; case 1551: -#line 1859 "char_ref.rl" {te = p+1;{ output->first = 0x29bc; {p++; 
goto _out; } }} break; case 1552: -#line 1860 "char_ref.rl" {te = p+1;{ output->first = 0x0153; {p++; goto _out; } }} break; case 1553: -#line 1861 "char_ref.rl" {te = p+1;{ output->first = 0x29bf; {p++; goto _out; } }} break; case 1554: -#line 1862 "char_ref.rl" {te = p+1;{ output->first = 0x0001d52c; {p++; goto _out; } }} break; case 1555: -#line 1863 "char_ref.rl" {te = p+1;{ output->first = 0x02db; {p++; goto _out; } }} break; case 1556: -#line 1864 "char_ref.rl" {te = p+1;{ output->first = 0xf2; {p++; goto _out; } }} break; case 1557: -#line 1866 "char_ref.rl" {te = p+1;{ output->first = 0x29c1; {p++; goto _out; } }} break; case 1558: -#line 1867 "char_ref.rl" {te = p+1;{ output->first = 0x29b5; {p++; goto _out; } }} break; case 1559: -#line 1868 "char_ref.rl" {te = p+1;{ output->first = 0x03a9; {p++; goto _out; } }} break; case 1560: -#line 1869 "char_ref.rl" {te = p+1;{ output->first = 0x222e; {p++; goto _out; } }} break; case 1561: -#line 1870 "char_ref.rl" {te = p+1;{ output->first = 0x21ba; {p++; goto _out; } }} break; case 1562: -#line 1871 "char_ref.rl" {te = p+1;{ output->first = 0x29be; {p++; goto _out; } }} break; case 1563: -#line 1872 "char_ref.rl" {te = p+1;{ output->first = 0x29bb; {p++; goto _out; } }} break; case 1564: -#line 1873 "char_ref.rl" {te = p+1;{ output->first = 0x203e; {p++; goto _out; } }} break; case 1565: -#line 1874 "char_ref.rl" {te = p+1;{ output->first = 0x29c0; {p++; goto _out; } }} break; case 1566: -#line 1875 "char_ref.rl" {te = p+1;{ output->first = 0x014d; {p++; goto _out; } }} break; case 1567: -#line 1876 "char_ref.rl" {te = p+1;{ output->first = 0x03c9; {p++; goto _out; } }} break; case 1568: -#line 1877 "char_ref.rl" {te = p+1;{ output->first = 0x03bf; {p++; goto _out; } }} break; case 1569: -#line 1878 "char_ref.rl" {te = p+1;{ output->first = 0x29b6; {p++; goto _out; } }} break; case 1570: -#line 1879 "char_ref.rl" {te = p+1;{ output->first = 0x2296; {p++; goto _out; } }} break; case 1571: -#line 1880 "char_ref.rl" 
{te = p+1;{ output->first = 0x0001d560; {p++; goto _out; } }} break; case 1572: -#line 1881 "char_ref.rl" {te = p+1;{ output->first = 0x29b7; {p++; goto _out; } }} break; case 1573: -#line 1882 "char_ref.rl" {te = p+1;{ output->first = 0x29b9; {p++; goto _out; } }} break; case 1574: -#line 1883 "char_ref.rl" {te = p+1;{ output->first = 0x2295; {p++; goto _out; } }} break; case 1575: -#line 1884 "char_ref.rl" {te = p+1;{ output->first = 0x2228; {p++; goto _out; } }} break; case 1576: -#line 1885 "char_ref.rl" {te = p+1;{ output->first = 0x21bb; {p++; goto _out; } }} break; case 1577: -#line 1886 "char_ref.rl" {te = p+1;{ output->first = 0x2a5d; {p++; goto _out; } }} break; case 1578: -#line 1887 "char_ref.rl" {te = p+1;{ output->first = 0x2134; {p++; goto _out; } }} break; case 1579: -#line 1888 "char_ref.rl" {te = p+1;{ output->first = 0x2134; {p++; goto _out; } }} break; case 1580: -#line 1889 "char_ref.rl" {te = p+1;{ output->first = 0xaa; {p++; goto _out; } }} break; case 1581: -#line 1891 "char_ref.rl" {te = p+1;{ output->first = 0xba; {p++; goto _out; } }} break; case 1582: -#line 1893 "char_ref.rl" {te = p+1;{ output->first = 0x22b6; {p++; goto _out; } }} break; case 1583: -#line 1894 "char_ref.rl" {te = p+1;{ output->first = 0x2a56; {p++; goto _out; } }} break; case 1584: -#line 1895 "char_ref.rl" {te = p+1;{ output->first = 0x2a57; {p++; goto _out; } }} break; case 1585: -#line 1896 "char_ref.rl" {te = p+1;{ output->first = 0x2a5b; {p++; goto _out; } }} break; case 1586: -#line 1897 "char_ref.rl" {te = p+1;{ output->first = 0x2134; {p++; goto _out; } }} break; case 1587: -#line 1898 "char_ref.rl" {te = p+1;{ output->first = 0xf8; {p++; goto _out; } }} break; case 1588: -#line 1900 "char_ref.rl" {te = p+1;{ output->first = 0x2298; {p++; goto _out; } }} break; case 1589: -#line 1901 "char_ref.rl" {te = p+1;{ output->first = 0xf5; {p++; goto _out; } }} break; case 1590: -#line 1903 "char_ref.rl" {te = p+1;{ output->first = 0x2297; {p++; goto _out; } }} break; 
case 1591: -#line 1904 "char_ref.rl" {te = p+1;{ output->first = 0x2a36; {p++; goto _out; } }} break; case 1592: -#line 1905 "char_ref.rl" {te = p+1;{ output->first = 0xf6; {p++; goto _out; } }} break; case 1593: -#line 1907 "char_ref.rl" {te = p+1;{ output->first = 0x233d; {p++; goto _out; } }} break; case 1594: -#line 1908 "char_ref.rl" {te = p+1;{ output->first = 0x2225; {p++; goto _out; } }} break; case 1595: -#line 1909 "char_ref.rl" {te = p+1;{ output->first = 0xb6; {p++; goto _out; } }} break; case 1596: -#line 1911 "char_ref.rl" {te = p+1;{ output->first = 0x2225; {p++; goto _out; } }} break; case 1597: -#line 1912 "char_ref.rl" {te = p+1;{ output->first = 0x2af3; {p++; goto _out; } }} break; case 1598: -#line 1913 "char_ref.rl" {te = p+1;{ output->first = 0x2afd; {p++; goto _out; } }} break; case 1599: -#line 1914 "char_ref.rl" {te = p+1;{ output->first = 0x2202; {p++; goto _out; } }} break; case 1600: -#line 1915 "char_ref.rl" {te = p+1;{ output->first = 0x043f; {p++; goto _out; } }} break; case 1601: -#line 1916 "char_ref.rl" {te = p+1;{ output->first = 0x25; {p++; goto _out; } }} break; case 1602: -#line 1917 "char_ref.rl" {te = p+1;{ output->first = 0x2e; {p++; goto _out; } }} break; case 1603: -#line 1918 "char_ref.rl" {te = p+1;{ output->first = 0x2030; {p++; goto _out; } }} break; case 1604: -#line 1919 "char_ref.rl" {te = p+1;{ output->first = 0x22a5; {p++; goto _out; } }} break; case 1605: -#line 1920 "char_ref.rl" {te = p+1;{ output->first = 0x2031; {p++; goto _out; } }} break; case 1606: -#line 1921 "char_ref.rl" {te = p+1;{ output->first = 0x0001d52d; {p++; goto _out; } }} break; case 1607: -#line 1922 "char_ref.rl" {te = p+1;{ output->first = 0x03c6; {p++; goto _out; } }} break; case 1608: -#line 1923 "char_ref.rl" {te = p+1;{ output->first = 0x03d5; {p++; goto _out; } }} break; case 1609: -#line 1924 "char_ref.rl" {te = p+1;{ output->first = 0x2133; {p++; goto _out; } }} break; case 1610: -#line 1925 "char_ref.rl" {te = p+1;{ output->first = 
0x260e; {p++; goto _out; } }} break; case 1611: -#line 1926 "char_ref.rl" {te = p+1;{ output->first = 0x03c0; {p++; goto _out; } }} break; case 1612: -#line 1927 "char_ref.rl" {te = p+1;{ output->first = 0x22d4; {p++; goto _out; } }} break; case 1613: -#line 1928 "char_ref.rl" {te = p+1;{ output->first = 0x03d6; {p++; goto _out; } }} break; case 1614: -#line 1929 "char_ref.rl" {te = p+1;{ output->first = 0x210f; {p++; goto _out; } }} break; case 1615: -#line 1930 "char_ref.rl" {te = p+1;{ output->first = 0x210e; {p++; goto _out; } }} break; case 1616: -#line 1931 "char_ref.rl" {te = p+1;{ output->first = 0x210f; {p++; goto _out; } }} break; case 1617: -#line 1932 "char_ref.rl" {te = p+1;{ output->first = 0x2b; {p++; goto _out; } }} break; case 1618: -#line 1933 "char_ref.rl" {te = p+1;{ output->first = 0x2a23; {p++; goto _out; } }} break; case 1619: -#line 1934 "char_ref.rl" {te = p+1;{ output->first = 0x229e; {p++; goto _out; } }} break; case 1620: -#line 1935 "char_ref.rl" {te = p+1;{ output->first = 0x2a22; {p++; goto _out; } }} break; case 1621: -#line 1936 "char_ref.rl" {te = p+1;{ output->first = 0x2214; {p++; goto _out; } }} break; case 1622: -#line 1937 "char_ref.rl" {te = p+1;{ output->first = 0x2a25; {p++; goto _out; } }} break; case 1623: -#line 1938 "char_ref.rl" {te = p+1;{ output->first = 0x2a72; {p++; goto _out; } }} break; case 1624: -#line 1939 "char_ref.rl" {te = p+1;{ output->first = 0xb1; {p++; goto _out; } }} break; case 1625: -#line 1941 "char_ref.rl" {te = p+1;{ output->first = 0x2a26; {p++; goto _out; } }} break; case 1626: -#line 1942 "char_ref.rl" {te = p+1;{ output->first = 0x2a27; {p++; goto _out; } }} break; case 1627: -#line 1943 "char_ref.rl" {te = p+1;{ output->first = 0xb1; {p++; goto _out; } }} break; case 1628: -#line 1944 "char_ref.rl" {te = p+1;{ output->first = 0x2a15; {p++; goto _out; } }} break; case 1629: -#line 1945 "char_ref.rl" {te = p+1;{ output->first = 0x0001d561; {p++; goto _out; } }} break; case 1630: -#line 1946 
"char_ref.rl" {te = p+1;{ output->first = 0xa3; {p++; goto _out; } }} break; case 1631: -#line 1948 "char_ref.rl" {te = p+1;{ output->first = 0x227a; {p++; goto _out; } }} break; case 1632: -#line 1949 "char_ref.rl" {te = p+1;{ output->first = 0x2ab3; {p++; goto _out; } }} break; case 1633: -#line 1950 "char_ref.rl" {te = p+1;{ output->first = 0x2ab7; {p++; goto _out; } }} break; case 1634: -#line 1951 "char_ref.rl" {te = p+1;{ output->first = 0x227c; {p++; goto _out; } }} break; case 1635: -#line 1952 "char_ref.rl" {te = p+1;{ output->first = 0x2aaf; {p++; goto _out; } }} break; case 1636: -#line 1953 "char_ref.rl" {te = p+1;{ output->first = 0x227a; {p++; goto _out; } }} break; case 1637: -#line 1954 "char_ref.rl" {te = p+1;{ output->first = 0x2ab7; {p++; goto _out; } }} break; case 1638: -#line 1955 "char_ref.rl" {te = p+1;{ output->first = 0x227c; {p++; goto _out; } }} break; case 1639: -#line 1956 "char_ref.rl" {te = p+1;{ output->first = 0x2aaf; {p++; goto _out; } }} break; case 1640: -#line 1957 "char_ref.rl" {te = p+1;{ output->first = 0x2ab9; {p++; goto _out; } }} break; case 1641: -#line 1958 "char_ref.rl" {te = p+1;{ output->first = 0x2ab5; {p++; goto _out; } }} break; case 1642: -#line 1959 "char_ref.rl" {te = p+1;{ output->first = 0x22e8; {p++; goto _out; } }} break; case 1643: -#line 1960 "char_ref.rl" {te = p+1;{ output->first = 0x227e; {p++; goto _out; } }} break; case 1644: -#line 1961 "char_ref.rl" {te = p+1;{ output->first = 0x2032; {p++; goto _out; } }} break; case 1645: -#line 1962 "char_ref.rl" {te = p+1;{ output->first = 0x2119; {p++; goto _out; } }} break; case 1646: -#line 1963 "char_ref.rl" {te = p+1;{ output->first = 0x2ab5; {p++; goto _out; } }} break; case 1647: -#line 1964 "char_ref.rl" {te = p+1;{ output->first = 0x2ab9; {p++; goto _out; } }} break; case 1648: -#line 1965 "char_ref.rl" {te = p+1;{ output->first = 0x22e8; {p++; goto _out; } }} break; case 1649: -#line 1966 "char_ref.rl" {te = p+1;{ output->first = 0x220f; {p++; goto 
_out; } }} break; case 1650: -#line 1967 "char_ref.rl" {te = p+1;{ output->first = 0x232e; {p++; goto _out; } }} break; case 1651: -#line 1968 "char_ref.rl" {te = p+1;{ output->first = 0x2312; {p++; goto _out; } }} break; case 1652: -#line 1969 "char_ref.rl" {te = p+1;{ output->first = 0x2313; {p++; goto _out; } }} break; case 1653: -#line 1970 "char_ref.rl" {te = p+1;{ output->first = 0x221d; {p++; goto _out; } }} break; case 1654: -#line 1971 "char_ref.rl" {te = p+1;{ output->first = 0x221d; {p++; goto _out; } }} break; case 1655: -#line 1972 "char_ref.rl" {te = p+1;{ output->first = 0x227e; {p++; goto _out; } }} break; case 1656: -#line 1973 "char_ref.rl" {te = p+1;{ output->first = 0x22b0; {p++; goto _out; } }} break; case 1657: -#line 1974 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c5; {p++; goto _out; } }} break; case 1658: -#line 1975 "char_ref.rl" {te = p+1;{ output->first = 0x03c8; {p++; goto _out; } }} break; case 1659: -#line 1976 "char_ref.rl" {te = p+1;{ output->first = 0x2008; {p++; goto _out; } }} break; case 1660: -#line 1977 "char_ref.rl" {te = p+1;{ output->first = 0x0001d52e; {p++; goto _out; } }} break; case 1661: -#line 1978 "char_ref.rl" {te = p+1;{ output->first = 0x2a0c; {p++; goto _out; } }} break; case 1662: -#line 1979 "char_ref.rl" {te = p+1;{ output->first = 0x0001d562; {p++; goto _out; } }} break; case 1663: -#line 1980 "char_ref.rl" {te = p+1;{ output->first = 0x2057; {p++; goto _out; } }} break; case 1664: -#line 1981 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c6; {p++; goto _out; } }} break; case 1665: -#line 1982 "char_ref.rl" {te = p+1;{ output->first = 0x210d; {p++; goto _out; } }} break; case 1666: -#line 1983 "char_ref.rl" {te = p+1;{ output->first = 0x2a16; {p++; goto _out; } }} break; case 1667: -#line 1984 "char_ref.rl" {te = p+1;{ output->first = 0x3f; {p++; goto _out; } }} break; case 1668: -#line 1985 "char_ref.rl" {te = p+1;{ output->first = 0x225f; {p++; goto _out; } }} break; case 1669: -#line 1986 
"char_ref.rl" {te = p+1;{ output->first = 0x22; {p++; goto _out; } }} break; case 1670: -#line 1988 "char_ref.rl" {te = p+1;{ output->first = 0x21db; {p++; goto _out; } }} break; case 1671: -#line 1989 "char_ref.rl" {te = p+1;{ output->first = 0x21d2; {p++; goto _out; } }} break; case 1672: -#line 1990 "char_ref.rl" {te = p+1;{ output->first = 0x291c; {p++; goto _out; } }} break; case 1673: -#line 1991 "char_ref.rl" {te = p+1;{ output->first = 0x290f; {p++; goto _out; } }} break; case 1674: -#line 1992 "char_ref.rl" {te = p+1;{ output->first = 0x2964; {p++; goto _out; } }} break; case 1675: -#line 1993 "char_ref.rl" {te = p+1;{ output->first = 0x223d; output->second = 0x0331; {p++; goto _out; } }} break; case 1676: -#line 1994 "char_ref.rl" {te = p+1;{ output->first = 0x0155; {p++; goto _out; } }} break; case 1677: -#line 1995 "char_ref.rl" {te = p+1;{ output->first = 0x221a; {p++; goto _out; } }} break; case 1678: -#line 1996 "char_ref.rl" {te = p+1;{ output->first = 0x29b3; {p++; goto _out; } }} break; case 1679: -#line 1997 "char_ref.rl" {te = p+1;{ output->first = 0x27e9; {p++; goto _out; } }} break; case 1680: -#line 1998 "char_ref.rl" {te = p+1;{ output->first = 0x2992; {p++; goto _out; } }} break; case 1681: -#line 1999 "char_ref.rl" {te = p+1;{ output->first = 0x29a5; {p++; goto _out; } }} break; case 1682: -#line 2000 "char_ref.rl" {te = p+1;{ output->first = 0x27e9; {p++; goto _out; } }} break; case 1683: -#line 2001 "char_ref.rl" {te = p+1;{ output->first = 0xbb; {p++; goto _out; } }} break; case 1684: -#line 2003 "char_ref.rl" {te = p+1;{ output->first = 0x2192; {p++; goto _out; } }} break; case 1685: -#line 2004 "char_ref.rl" {te = p+1;{ output->first = 0x2975; {p++; goto _out; } }} break; case 1686: -#line 2005 "char_ref.rl" {te = p+1;{ output->first = 0x21e5; {p++; goto _out; } }} break; case 1687: -#line 2006 "char_ref.rl" {te = p+1;{ output->first = 0x2920; {p++; goto _out; } }} break; case 1688: -#line 2007 "char_ref.rl" {te = p+1;{ output->first 
= 0x2933; {p++; goto _out; } }} break; case 1689: -#line 2008 "char_ref.rl" {te = p+1;{ output->first = 0x291e; {p++; goto _out; } }} break; case 1690: -#line 2009 "char_ref.rl" {te = p+1;{ output->first = 0x21aa; {p++; goto _out; } }} break; case 1691: -#line 2010 "char_ref.rl" {te = p+1;{ output->first = 0x21ac; {p++; goto _out; } }} break; case 1692: -#line 2011 "char_ref.rl" {te = p+1;{ output->first = 0x2945; {p++; goto _out; } }} break; case 1693: -#line 2012 "char_ref.rl" {te = p+1;{ output->first = 0x2974; {p++; goto _out; } }} break; case 1694: -#line 2013 "char_ref.rl" {te = p+1;{ output->first = 0x21a3; {p++; goto _out; } }} break; case 1695: -#line 2014 "char_ref.rl" {te = p+1;{ output->first = 0x219d; {p++; goto _out; } }} break; case 1696: -#line 2015 "char_ref.rl" {te = p+1;{ output->first = 0x291a; {p++; goto _out; } }} break; case 1697: -#line 2016 "char_ref.rl" {te = p+1;{ output->first = 0x2236; {p++; goto _out; } }} break; case 1698: -#line 2017 "char_ref.rl" {te = p+1;{ output->first = 0x211a; {p++; goto _out; } }} break; case 1699: -#line 2018 "char_ref.rl" {te = p+1;{ output->first = 0x290d; {p++; goto _out; } }} break; case 1700: -#line 2019 "char_ref.rl" {te = p+1;{ output->first = 0x2773; {p++; goto _out; } }} break; case 1701: -#line 2020 "char_ref.rl" {te = p+1;{ output->first = 0x7d; {p++; goto _out; } }} break; case 1702: -#line 2021 "char_ref.rl" {te = p+1;{ output->first = 0x5d; {p++; goto _out; } }} break; case 1703: -#line 2022 "char_ref.rl" {te = p+1;{ output->first = 0x298c; {p++; goto _out; } }} break; case 1704: -#line 2023 "char_ref.rl" {te = p+1;{ output->first = 0x298e; {p++; goto _out; } }} break; case 1705: -#line 2024 "char_ref.rl" {te = p+1;{ output->first = 0x2990; {p++; goto _out; } }} break; case 1706: -#line 2025 "char_ref.rl" {te = p+1;{ output->first = 0x0159; {p++; goto _out; } }} break; case 1707: -#line 2026 "char_ref.rl" {te = p+1;{ output->first = 0x0157; {p++; goto _out; } }} break; case 1708: -#line 2027 
"char_ref.rl" {te = p+1;{ output->first = 0x2309; {p++; goto _out; } }} break; case 1709: -#line 2028 "char_ref.rl" {te = p+1;{ output->first = 0x7d; {p++; goto _out; } }} break; case 1710: -#line 2029 "char_ref.rl" {te = p+1;{ output->first = 0x0440; {p++; goto _out; } }} break; case 1711: -#line 2030 "char_ref.rl" {te = p+1;{ output->first = 0x2937; {p++; goto _out; } }} break; case 1712: -#line 2031 "char_ref.rl" {te = p+1;{ output->first = 0x2969; {p++; goto _out; } }} break; case 1713: -#line 2032 "char_ref.rl" {te = p+1;{ output->first = 0x201d; {p++; goto _out; } }} break; case 1714: -#line 2033 "char_ref.rl" {te = p+1;{ output->first = 0x201d; {p++; goto _out; } }} break; case 1715: -#line 2034 "char_ref.rl" {te = p+1;{ output->first = 0x21b3; {p++; goto _out; } }} break; case 1716: -#line 2035 "char_ref.rl" {te = p+1;{ output->first = 0x211c; {p++; goto _out; } }} break; case 1717: -#line 2036 "char_ref.rl" {te = p+1;{ output->first = 0x211b; {p++; goto _out; } }} break; case 1718: -#line 2037 "char_ref.rl" {te = p+1;{ output->first = 0x211c; {p++; goto _out; } }} break; case 1719: -#line 2038 "char_ref.rl" {te = p+1;{ output->first = 0x211d; {p++; goto _out; } }} break; case 1720: -#line 2039 "char_ref.rl" {te = p+1;{ output->first = 0x25ad; {p++; goto _out; } }} break; case 1721: -#line 2040 "char_ref.rl" {te = p+1;{ output->first = 0xae; {p++; goto _out; } }} break; case 1722: -#line 2042 "char_ref.rl" {te = p+1;{ output->first = 0x297d; {p++; goto _out; } }} break; case 1723: -#line 2043 "char_ref.rl" {te = p+1;{ output->first = 0x230b; {p++; goto _out; } }} break; case 1724: -#line 2044 "char_ref.rl" {te = p+1;{ output->first = 0x0001d52f; {p++; goto _out; } }} break; case 1725: -#line 2045 "char_ref.rl" {te = p+1;{ output->first = 0x21c1; {p++; goto _out; } }} break; case 1726: -#line 2046 "char_ref.rl" {te = p+1;{ output->first = 0x21c0; {p++; goto _out; } }} break; case 1727: -#line 2047 "char_ref.rl" {te = p+1;{ output->first = 0x296c; {p++; goto 
_out; } }} break; case 1728: -#line 2048 "char_ref.rl" {te = p+1;{ output->first = 0x03c1; {p++; goto _out; } }} break; case 1729: -#line 2049 "char_ref.rl" {te = p+1;{ output->first = 0x03f1; {p++; goto _out; } }} break; case 1730: -#line 2050 "char_ref.rl" {te = p+1;{ output->first = 0x2192; {p++; goto _out; } }} break; case 1731: -#line 2051 "char_ref.rl" {te = p+1;{ output->first = 0x21a3; {p++; goto _out; } }} break; case 1732: -#line 2052 "char_ref.rl" {te = p+1;{ output->first = 0x21c1; {p++; goto _out; } }} break; case 1733: -#line 2053 "char_ref.rl" {te = p+1;{ output->first = 0x21c0; {p++; goto _out; } }} break; case 1734: -#line 2054 "char_ref.rl" {te = p+1;{ output->first = 0x21c4; {p++; goto _out; } }} break; case 1735: -#line 2055 "char_ref.rl" {te = p+1;{ output->first = 0x21cc; {p++; goto _out; } }} break; case 1736: -#line 2056 "char_ref.rl" {te = p+1;{ output->first = 0x21c9; {p++; goto _out; } }} break; case 1737: -#line 2057 "char_ref.rl" {te = p+1;{ output->first = 0x219d; {p++; goto _out; } }} break; case 1738: -#line 2058 "char_ref.rl" {te = p+1;{ output->first = 0x22cc; {p++; goto _out; } }} break; case 1739: -#line 2059 "char_ref.rl" {te = p+1;{ output->first = 0x02da; {p++; goto _out; } }} break; case 1740: -#line 2060 "char_ref.rl" {te = p+1;{ output->first = 0x2253; {p++; goto _out; } }} break; case 1741: -#line 2061 "char_ref.rl" {te = p+1;{ output->first = 0x21c4; {p++; goto _out; } }} break; case 1742: -#line 2062 "char_ref.rl" {te = p+1;{ output->first = 0x21cc; {p++; goto _out; } }} break; case 1743: -#line 2063 "char_ref.rl" {te = p+1;{ output->first = 0x200f; {p++; goto _out; } }} break; case 1744: -#line 2064 "char_ref.rl" {te = p+1;{ output->first = 0x23b1; {p++; goto _out; } }} break; case 1745: -#line 2065 "char_ref.rl" {te = p+1;{ output->first = 0x23b1; {p++; goto _out; } }} break; case 1746: -#line 2066 "char_ref.rl" {te = p+1;{ output->first = 0x2aee; {p++; goto _out; } }} break; case 1747: -#line 2067 "char_ref.rl" {te = 
p+1;{ output->first = 0x27ed; {p++; goto _out; } }} break; case 1748: -#line 2068 "char_ref.rl" {te = p+1;{ output->first = 0x21fe; {p++; goto _out; } }} break; case 1749: -#line 2069 "char_ref.rl" {te = p+1;{ output->first = 0x27e7; {p++; goto _out; } }} break; case 1750: -#line 2070 "char_ref.rl" {te = p+1;{ output->first = 0x2986; {p++; goto _out; } }} break; case 1751: -#line 2071 "char_ref.rl" {te = p+1;{ output->first = 0x0001d563; {p++; goto _out; } }} break; case 1752: -#line 2072 "char_ref.rl" {te = p+1;{ output->first = 0x2a2e; {p++; goto _out; } }} break; case 1753: -#line 2073 "char_ref.rl" {te = p+1;{ output->first = 0x2a35; {p++; goto _out; } }} break; case 1754: -#line 2074 "char_ref.rl" {te = p+1;{ output->first = 0x29; {p++; goto _out; } }} break; case 1755: -#line 2075 "char_ref.rl" {te = p+1;{ output->first = 0x2994; {p++; goto _out; } }} break; case 1756: -#line 2076 "char_ref.rl" {te = p+1;{ output->first = 0x2a12; {p++; goto _out; } }} break; case 1757: -#line 2077 "char_ref.rl" {te = p+1;{ output->first = 0x21c9; {p++; goto _out; } }} break; case 1758: -#line 2078 "char_ref.rl" {te = p+1;{ output->first = 0x203a; {p++; goto _out; } }} break; case 1759: -#line 2079 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c7; {p++; goto _out; } }} break; case 1760: -#line 2080 "char_ref.rl" {te = p+1;{ output->first = 0x21b1; {p++; goto _out; } }} break; case 1761: -#line 2081 "char_ref.rl" {te = p+1;{ output->first = 0x5d; {p++; goto _out; } }} break; case 1762: -#line 2082 "char_ref.rl" {te = p+1;{ output->first = 0x2019; {p++; goto _out; } }} break; case 1763: -#line 2083 "char_ref.rl" {te = p+1;{ output->first = 0x2019; {p++; goto _out; } }} break; case 1764: -#line 2084 "char_ref.rl" {te = p+1;{ output->first = 0x22cc; {p++; goto _out; } }} break; case 1765: -#line 2085 "char_ref.rl" {te = p+1;{ output->first = 0x22ca; {p++; goto _out; } }} break; case 1766: -#line 2086 "char_ref.rl" {te = p+1;{ output->first = 0x25b9; {p++; goto _out; } }} 
break; case 1767: -#line 2087 "char_ref.rl" {te = p+1;{ output->first = 0x22b5; {p++; goto _out; } }} break; case 1768: -#line 2088 "char_ref.rl" {te = p+1;{ output->first = 0x25b8; {p++; goto _out; } }} break; case 1769: -#line 2089 "char_ref.rl" {te = p+1;{ output->first = 0x29ce; {p++; goto _out; } }} break; case 1770: -#line 2090 "char_ref.rl" {te = p+1;{ output->first = 0x2968; {p++; goto _out; } }} break; case 1771: -#line 2091 "char_ref.rl" {te = p+1;{ output->first = 0x211e; {p++; goto _out; } }} break; case 1772: -#line 2092 "char_ref.rl" {te = p+1;{ output->first = 0x015b; {p++; goto _out; } }} break; case 1773: -#line 2093 "char_ref.rl" {te = p+1;{ output->first = 0x201a; {p++; goto _out; } }} break; case 1774: -#line 2094 "char_ref.rl" {te = p+1;{ output->first = 0x227b; {p++; goto _out; } }} break; case 1775: -#line 2095 "char_ref.rl" {te = p+1;{ output->first = 0x2ab4; {p++; goto _out; } }} break; case 1776: -#line 2096 "char_ref.rl" {te = p+1;{ output->first = 0x2ab8; {p++; goto _out; } }} break; case 1777: -#line 2097 "char_ref.rl" {te = p+1;{ output->first = 0x0161; {p++; goto _out; } }} break; case 1778: -#line 2098 "char_ref.rl" {te = p+1;{ output->first = 0x227d; {p++; goto _out; } }} break; case 1779: -#line 2099 "char_ref.rl" {te = p+1;{ output->first = 0x2ab0; {p++; goto _out; } }} break; case 1780: -#line 2100 "char_ref.rl" {te = p+1;{ output->first = 0x015f; {p++; goto _out; } }} break; case 1781: -#line 2101 "char_ref.rl" {te = p+1;{ output->first = 0x015d; {p++; goto _out; } }} break; case 1782: -#line 2102 "char_ref.rl" {te = p+1;{ output->first = 0x2ab6; {p++; goto _out; } }} break; case 1783: -#line 2103 "char_ref.rl" {te = p+1;{ output->first = 0x2aba; {p++; goto _out; } }} break; case 1784: -#line 2104 "char_ref.rl" {te = p+1;{ output->first = 0x22e9; {p++; goto _out; } }} break; case 1785: -#line 2105 "char_ref.rl" {te = p+1;{ output->first = 0x2a13; {p++; goto _out; } }} break; case 1786: -#line 2106 "char_ref.rl" {te = p+1;{ 
output->first = 0x227f; {p++; goto _out; } }} break; case 1787: -#line 2107 "char_ref.rl" {te = p+1;{ output->first = 0x0441; {p++; goto _out; } }} break; case 1788: -#line 2108 "char_ref.rl" {te = p+1;{ output->first = 0x22c5; {p++; goto _out; } }} break; case 1789: -#line 2109 "char_ref.rl" {te = p+1;{ output->first = 0x22a1; {p++; goto _out; } }} break; case 1790: -#line 2110 "char_ref.rl" {te = p+1;{ output->first = 0x2a66; {p++; goto _out; } }} break; case 1791: -#line 2111 "char_ref.rl" {te = p+1;{ output->first = 0x21d8; {p++; goto _out; } }} break; case 1792: -#line 2112 "char_ref.rl" {te = p+1;{ output->first = 0x2925; {p++; goto _out; } }} break; case 1793: -#line 2113 "char_ref.rl" {te = p+1;{ output->first = 0x2198; {p++; goto _out; } }} break; case 1794: -#line 2114 "char_ref.rl" {te = p+1;{ output->first = 0x2198; {p++; goto _out; } }} break; case 1795: -#line 2115 "char_ref.rl" {te = p+1;{ output->first = 0xa7; {p++; goto _out; } }} break; case 1796: -#line 2117 "char_ref.rl" {te = p+1;{ output->first = 0x3b; {p++; goto _out; } }} break; case 1797: -#line 2118 "char_ref.rl" {te = p+1;{ output->first = 0x2929; {p++; goto _out; } }} break; case 1798: -#line 2119 "char_ref.rl" {te = p+1;{ output->first = 0x2216; {p++; goto _out; } }} break; case 1799: -#line 2120 "char_ref.rl" {te = p+1;{ output->first = 0x2216; {p++; goto _out; } }} break; case 1800: -#line 2121 "char_ref.rl" {te = p+1;{ output->first = 0x2736; {p++; goto _out; } }} break; case 1801: -#line 2122 "char_ref.rl" {te = p+1;{ output->first = 0x0001d530; {p++; goto _out; } }} break; case 1802: -#line 2123 "char_ref.rl" {te = p+1;{ output->first = 0x2322; {p++; goto _out; } }} break; case 1803: -#line 2124 "char_ref.rl" {te = p+1;{ output->first = 0x266f; {p++; goto _out; } }} break; case 1804: -#line 2125 "char_ref.rl" {te = p+1;{ output->first = 0x0449; {p++; goto _out; } }} break; case 1805: -#line 2126 "char_ref.rl" {te = p+1;{ output->first = 0x0448; {p++; goto _out; } }} break; case 
1806: -#line 2127 "char_ref.rl" {te = p+1;{ output->first = 0x2223; {p++; goto _out; } }} break; case 1807: -#line 2128 "char_ref.rl" {te = p+1;{ output->first = 0x2225; {p++; goto _out; } }} break; case 1808: -#line 2129 "char_ref.rl" {te = p+1;{ output->first = 0xad; {p++; goto _out; } }} break; case 1809: -#line 2131 "char_ref.rl" {te = p+1;{ output->first = 0x03c3; {p++; goto _out; } }} break; case 1810: -#line 2132 "char_ref.rl" {te = p+1;{ output->first = 0x03c2; {p++; goto _out; } }} break; case 1811: -#line 2133 "char_ref.rl" {te = p+1;{ output->first = 0x03c2; {p++; goto _out; } }} break; case 1812: -#line 2134 "char_ref.rl" {te = p+1;{ output->first = 0x223c; {p++; goto _out; } }} break; case 1813: -#line 2135 "char_ref.rl" {te = p+1;{ output->first = 0x2a6a; {p++; goto _out; } }} break; case 1814: -#line 2136 "char_ref.rl" {te = p+1;{ output->first = 0x2243; {p++; goto _out; } }} break; case 1815: -#line 2137 "char_ref.rl" {te = p+1;{ output->first = 0x2243; {p++; goto _out; } }} break; case 1816: -#line 2138 "char_ref.rl" {te = p+1;{ output->first = 0x2a9e; {p++; goto _out; } }} break; case 1817: -#line 2139 "char_ref.rl" {te = p+1;{ output->first = 0x2aa0; {p++; goto _out; } }} break; case 1818: -#line 2140 "char_ref.rl" {te = p+1;{ output->first = 0x2a9d; {p++; goto _out; } }} break; case 1819: -#line 2141 "char_ref.rl" {te = p+1;{ output->first = 0x2a9f; {p++; goto _out; } }} break; case 1820: -#line 2142 "char_ref.rl" {te = p+1;{ output->first = 0x2246; {p++; goto _out; } }} break; case 1821: -#line 2143 "char_ref.rl" {te = p+1;{ output->first = 0x2a24; {p++; goto _out; } }} break; case 1822: -#line 2144 "char_ref.rl" {te = p+1;{ output->first = 0x2972; {p++; goto _out; } }} break; case 1823: -#line 2145 "char_ref.rl" {te = p+1;{ output->first = 0x2190; {p++; goto _out; } }} break; case 1824: -#line 2146 "char_ref.rl" {te = p+1;{ output->first = 0x2216; {p++; goto _out; } }} break; case 1825: -#line 2147 "char_ref.rl" {te = p+1;{ output->first = 
0x2a33; {p++; goto _out; } }} break; case 1826: -#line 2148 "char_ref.rl" {te = p+1;{ output->first = 0x29e4; {p++; goto _out; } }} break; case 1827: -#line 2149 "char_ref.rl" {te = p+1;{ output->first = 0x2223; {p++; goto _out; } }} break; case 1828: -#line 2150 "char_ref.rl" {te = p+1;{ output->first = 0x2323; {p++; goto _out; } }} break; case 1829: -#line 2151 "char_ref.rl" {te = p+1;{ output->first = 0x2aaa; {p++; goto _out; } }} break; case 1830: -#line 2152 "char_ref.rl" {te = p+1;{ output->first = 0x2aac; {p++; goto _out; } }} break; case 1831: -#line 2153 "char_ref.rl" {te = p+1;{ output->first = 0x2aac; output->second = 0xfe00; {p++; goto _out; } }} break; case 1832: -#line 2154 "char_ref.rl" {te = p+1;{ output->first = 0x044c; {p++; goto _out; } }} break; case 1833: -#line 2155 "char_ref.rl" {te = p+1;{ output->first = 0x2f; {p++; goto _out; } }} break; case 1834: -#line 2156 "char_ref.rl" {te = p+1;{ output->first = 0x29c4; {p++; goto _out; } }} break; case 1835: -#line 2157 "char_ref.rl" {te = p+1;{ output->first = 0x233f; {p++; goto _out; } }} break; case 1836: -#line 2158 "char_ref.rl" {te = p+1;{ output->first = 0x0001d564; {p++; goto _out; } }} break; case 1837: -#line 2159 "char_ref.rl" {te = p+1;{ output->first = 0x2660; {p++; goto _out; } }} break; case 1838: -#line 2160 "char_ref.rl" {te = p+1;{ output->first = 0x2660; {p++; goto _out; } }} break; case 1839: -#line 2161 "char_ref.rl" {te = p+1;{ output->first = 0x2225; {p++; goto _out; } }} break; case 1840: -#line 2162 "char_ref.rl" {te = p+1;{ output->first = 0x2293; {p++; goto _out; } }} break; case 1841: -#line 2163 "char_ref.rl" {te = p+1;{ output->first = 0x2293; output->second = 0xfe00; {p++; goto _out; } }} break; case 1842: -#line 2164 "char_ref.rl" {te = p+1;{ output->first = 0x2294; {p++; goto _out; } }} break; case 1843: -#line 2165 "char_ref.rl" {te = p+1;{ output->first = 0x2294; output->second = 0xfe00; {p++; goto _out; } }} break; case 1844: -#line 2166 "char_ref.rl" {te = p+1;{ 
output->first = 0x228f; {p++; goto _out; } }} break; case 1845: -#line 2167 "char_ref.rl" {te = p+1;{ output->first = 0x2291; {p++; goto _out; } }} break; case 1846: -#line 2168 "char_ref.rl" {te = p+1;{ output->first = 0x228f; {p++; goto _out; } }} break; case 1847: -#line 2169 "char_ref.rl" {te = p+1;{ output->first = 0x2291; {p++; goto _out; } }} break; case 1848: -#line 2170 "char_ref.rl" {te = p+1;{ output->first = 0x2290; {p++; goto _out; } }} break; case 1849: -#line 2171 "char_ref.rl" {te = p+1;{ output->first = 0x2292; {p++; goto _out; } }} break; case 1850: -#line 2172 "char_ref.rl" {te = p+1;{ output->first = 0x2290; {p++; goto _out; } }} break; case 1851: -#line 2173 "char_ref.rl" {te = p+1;{ output->first = 0x2292; {p++; goto _out; } }} break; case 1852: -#line 2174 "char_ref.rl" {te = p+1;{ output->first = 0x25a1; {p++; goto _out; } }} break; case 1853: -#line 2175 "char_ref.rl" {te = p+1;{ output->first = 0x25a1; {p++; goto _out; } }} break; case 1854: -#line 2176 "char_ref.rl" {te = p+1;{ output->first = 0x25aa; {p++; goto _out; } }} break; case 1855: -#line 2177 "char_ref.rl" {te = p+1;{ output->first = 0x25aa; {p++; goto _out; } }} break; case 1856: -#line 2178 "char_ref.rl" {te = p+1;{ output->first = 0x2192; {p++; goto _out; } }} break; case 1857: -#line 2179 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c8; {p++; goto _out; } }} break; case 1858: -#line 2180 "char_ref.rl" {te = p+1;{ output->first = 0x2216; {p++; goto _out; } }} break; case 1859: -#line 2181 "char_ref.rl" {te = p+1;{ output->first = 0x2323; {p++; goto _out; } }} break; case 1860: -#line 2182 "char_ref.rl" {te = p+1;{ output->first = 0x22c6; {p++; goto _out; } }} break; case 1861: -#line 2183 "char_ref.rl" {te = p+1;{ output->first = 0x2606; {p++; goto _out; } }} break; case 1862: -#line 2184 "char_ref.rl" {te = p+1;{ output->first = 0x2605; {p++; goto _out; } }} break; case 1863: -#line 2185 "char_ref.rl" {te = p+1;{ output->first = 0x03f5; {p++; goto _out; } }} break; case 
1864: -#line 2186 "char_ref.rl" {te = p+1;{ output->first = 0x03d5; {p++; goto _out; } }} break; case 1865: -#line 2187 "char_ref.rl" {te = p+1;{ output->first = 0xaf; {p++; goto _out; } }} break; case 1866: -#line 2188 "char_ref.rl" {te = p+1;{ output->first = 0x2282; {p++; goto _out; } }} break; case 1867: -#line 2189 "char_ref.rl" {te = p+1;{ output->first = 0x2ac5; {p++; goto _out; } }} break; case 1868: -#line 2190 "char_ref.rl" {te = p+1;{ output->first = 0x2abd; {p++; goto _out; } }} break; case 1869: -#line 2191 "char_ref.rl" {te = p+1;{ output->first = 0x2286; {p++; goto _out; } }} break; case 1870: -#line 2192 "char_ref.rl" {te = p+1;{ output->first = 0x2ac3; {p++; goto _out; } }} break; case 1871: -#line 2193 "char_ref.rl" {te = p+1;{ output->first = 0x2ac1; {p++; goto _out; } }} break; case 1872: -#line 2194 "char_ref.rl" {te = p+1;{ output->first = 0x2acb; {p++; goto _out; } }} break; case 1873: -#line 2195 "char_ref.rl" {te = p+1;{ output->first = 0x228a; {p++; goto _out; } }} break; case 1874: -#line 2196 "char_ref.rl" {te = p+1;{ output->first = 0x2abf; {p++; goto _out; } }} break; case 1875: -#line 2197 "char_ref.rl" {te = p+1;{ output->first = 0x2979; {p++; goto _out; } }} break; case 1876: -#line 2198 "char_ref.rl" {te = p+1;{ output->first = 0x2282; {p++; goto _out; } }} break; case 1877: -#line 2199 "char_ref.rl" {te = p+1;{ output->first = 0x2286; {p++; goto _out; } }} break; case 1878: -#line 2200 "char_ref.rl" {te = p+1;{ output->first = 0x2ac5; {p++; goto _out; } }} break; case 1879: -#line 2201 "char_ref.rl" {te = p+1;{ output->first = 0x228a; {p++; goto _out; } }} break; case 1880: -#line 2202 "char_ref.rl" {te = p+1;{ output->first = 0x2acb; {p++; goto _out; } }} break; case 1881: -#line 2203 "char_ref.rl" {te = p+1;{ output->first = 0x2ac7; {p++; goto _out; } }} break; case 1882: -#line 2204 "char_ref.rl" {te = p+1;{ output->first = 0x2ad5; {p++; goto _out; } }} break; case 1883: -#line 2205 "char_ref.rl" {te = p+1;{ output->first = 
0x2ad3; {p++; goto _out; } }} break; case 1884: -#line 2206 "char_ref.rl" {te = p+1;{ output->first = 0x227b; {p++; goto _out; } }} break; case 1885: -#line 2207 "char_ref.rl" {te = p+1;{ output->first = 0x2ab8; {p++; goto _out; } }} break; case 1886: -#line 2208 "char_ref.rl" {te = p+1;{ output->first = 0x227d; {p++; goto _out; } }} break; case 1887: -#line 2209 "char_ref.rl" {te = p+1;{ output->first = 0x2ab0; {p++; goto _out; } }} break; case 1888: -#line 2210 "char_ref.rl" {te = p+1;{ output->first = 0x2aba; {p++; goto _out; } }} break; case 1889: -#line 2211 "char_ref.rl" {te = p+1;{ output->first = 0x2ab6; {p++; goto _out; } }} break; case 1890: -#line 2212 "char_ref.rl" {te = p+1;{ output->first = 0x22e9; {p++; goto _out; } }} break; case 1891: -#line 2213 "char_ref.rl" {te = p+1;{ output->first = 0x227f; {p++; goto _out; } }} break; case 1892: -#line 2214 "char_ref.rl" {te = p+1;{ output->first = 0x2211; {p++; goto _out; } }} break; case 1893: -#line 2215 "char_ref.rl" {te = p+1;{ output->first = 0x266a; {p++; goto _out; } }} break; case 1894: -#line 2216 "char_ref.rl" {te = p+1;{ output->first = 0xb9; {p++; goto _out; } }} break; case 1895: -#line 2218 "char_ref.rl" {te = p+1;{ output->first = 0xb2; {p++; goto _out; } }} break; case 1896: -#line 2220 "char_ref.rl" {te = p+1;{ output->first = 0xb3; {p++; goto _out; } }} break; case 1897: -#line 2222 "char_ref.rl" {te = p+1;{ output->first = 0x2283; {p++; goto _out; } }} break; case 1898: -#line 2223 "char_ref.rl" {te = p+1;{ output->first = 0x2ac6; {p++; goto _out; } }} break; case 1899: -#line 2224 "char_ref.rl" {te = p+1;{ output->first = 0x2abe; {p++; goto _out; } }} break; case 1900: -#line 2225 "char_ref.rl" {te = p+1;{ output->first = 0x2ad8; {p++; goto _out; } }} break; case 1901: -#line 2226 "char_ref.rl" {te = p+1;{ output->first = 0x2287; {p++; goto _out; } }} break; case 1902: -#line 2227 "char_ref.rl" {te = p+1;{ output->first = 0x2ac4; {p++; goto _out; } }} break; case 1903: -#line 2228 
"char_ref.rl" {te = p+1;{ output->first = 0x27c9; {p++; goto _out; } }} break; case 1904: -#line 2229 "char_ref.rl" {te = p+1;{ output->first = 0x2ad7; {p++; goto _out; } }} break; case 1905: -#line 2230 "char_ref.rl" {te = p+1;{ output->first = 0x297b; {p++; goto _out; } }} break; case 1906: -#line 2231 "char_ref.rl" {te = p+1;{ output->first = 0x2ac2; {p++; goto _out; } }} break; case 1907: -#line 2232 "char_ref.rl" {te = p+1;{ output->first = 0x2acc; {p++; goto _out; } }} break; case 1908: -#line 2233 "char_ref.rl" {te = p+1;{ output->first = 0x228b; {p++; goto _out; } }} break; case 1909: -#line 2234 "char_ref.rl" {te = p+1;{ output->first = 0x2ac0; {p++; goto _out; } }} break; case 1910: -#line 2235 "char_ref.rl" {te = p+1;{ output->first = 0x2283; {p++; goto _out; } }} break; case 1911: -#line 2236 "char_ref.rl" {te = p+1;{ output->first = 0x2287; {p++; goto _out; } }} break; case 1912: -#line 2237 "char_ref.rl" {te = p+1;{ output->first = 0x2ac6; {p++; goto _out; } }} break; case 1913: -#line 2238 "char_ref.rl" {te = p+1;{ output->first = 0x228b; {p++; goto _out; } }} break; case 1914: -#line 2239 "char_ref.rl" {te = p+1;{ output->first = 0x2acc; {p++; goto _out; } }} break; case 1915: -#line 2240 "char_ref.rl" {te = p+1;{ output->first = 0x2ac8; {p++; goto _out; } }} break; case 1916: -#line 2241 "char_ref.rl" {te = p+1;{ output->first = 0x2ad4; {p++; goto _out; } }} break; case 1917: -#line 2242 "char_ref.rl" {te = p+1;{ output->first = 0x2ad6; {p++; goto _out; } }} break; case 1918: -#line 2243 "char_ref.rl" {te = p+1;{ output->first = 0x21d9; {p++; goto _out; } }} break; case 1919: -#line 2244 "char_ref.rl" {te = p+1;{ output->first = 0x2926; {p++; goto _out; } }} break; case 1920: -#line 2245 "char_ref.rl" {te = p+1;{ output->first = 0x2199; {p++; goto _out; } }} break; case 1921: -#line 2246 "char_ref.rl" {te = p+1;{ output->first = 0x2199; {p++; goto _out; } }} break; case 1922: -#line 2247 "char_ref.rl" {te = p+1;{ output->first = 0x292a; {p++; goto 
_out; } }} break; case 1923: -#line 2248 "char_ref.rl" {te = p+1;{ output->first = 0xdf; {p++; goto _out; } }} break; case 1924: -#line 2250 "char_ref.rl" {te = p+1;{ output->first = 0x2316; {p++; goto _out; } }} break; case 1925: -#line 2251 "char_ref.rl" {te = p+1;{ output->first = 0x03c4; {p++; goto _out; } }} break; case 1926: -#line 2252 "char_ref.rl" {te = p+1;{ output->first = 0x23b4; {p++; goto _out; } }} break; case 1927: -#line 2253 "char_ref.rl" {te = p+1;{ output->first = 0x0165; {p++; goto _out; } }} break; case 1928: -#line 2254 "char_ref.rl" {te = p+1;{ output->first = 0x0163; {p++; goto _out; } }} break; case 1929: -#line 2255 "char_ref.rl" {te = p+1;{ output->first = 0x0442; {p++; goto _out; } }} break; case 1930: -#line 2256 "char_ref.rl" {te = p+1;{ output->first = 0x20db; {p++; goto _out; } }} break; case 1931: -#line 2257 "char_ref.rl" {te = p+1;{ output->first = 0x2315; {p++; goto _out; } }} break; case 1932: -#line 2258 "char_ref.rl" {te = p+1;{ output->first = 0x0001d531; {p++; goto _out; } }} break; case 1933: -#line 2259 "char_ref.rl" {te = p+1;{ output->first = 0x2234; {p++; goto _out; } }} break; case 1934: -#line 2260 "char_ref.rl" {te = p+1;{ output->first = 0x2234; {p++; goto _out; } }} break; case 1935: -#line 2261 "char_ref.rl" {te = p+1;{ output->first = 0x03b8; {p++; goto _out; } }} break; case 1936: -#line 2262 "char_ref.rl" {te = p+1;{ output->first = 0x03d1; {p++; goto _out; } }} break; case 1937: -#line 2263 "char_ref.rl" {te = p+1;{ output->first = 0x03d1; {p++; goto _out; } }} break; case 1938: -#line 2264 "char_ref.rl" {te = p+1;{ output->first = 0x2248; {p++; goto _out; } }} break; case 1939: -#line 2265 "char_ref.rl" {te = p+1;{ output->first = 0x223c; {p++; goto _out; } }} break; case 1940: -#line 2266 "char_ref.rl" {te = p+1;{ output->first = 0x2009; {p++; goto _out; } }} break; case 1941: -#line 2267 "char_ref.rl" {te = p+1;{ output->first = 0x2248; {p++; goto _out; } }} break; case 1942: -#line 2268 "char_ref.rl" {te 
= p+1;{ output->first = 0x223c; {p++; goto _out; } }} break; case 1943: -#line 2269 "char_ref.rl" {te = p+1;{ output->first = 0xfe; {p++; goto _out; } }} break; case 1944: -#line 2271 "char_ref.rl" {te = p+1;{ output->first = 0x02dc; {p++; goto _out; } }} break; case 1945: -#line 2272 "char_ref.rl" {te = p+1;{ output->first = 0xd7; {p++; goto _out; } }} break; case 1946: -#line 2274 "char_ref.rl" {te = p+1;{ output->first = 0x22a0; {p++; goto _out; } }} break; case 1947: -#line 2275 "char_ref.rl" {te = p+1;{ output->first = 0x2a31; {p++; goto _out; } }} break; case 1948: -#line 2276 "char_ref.rl" {te = p+1;{ output->first = 0x2a30; {p++; goto _out; } }} break; case 1949: -#line 2277 "char_ref.rl" {te = p+1;{ output->first = 0x222d; {p++; goto _out; } }} break; case 1950: -#line 2278 "char_ref.rl" {te = p+1;{ output->first = 0x2928; {p++; goto _out; } }} break; case 1951: -#line 2279 "char_ref.rl" {te = p+1;{ output->first = 0x22a4; {p++; goto _out; } }} break; case 1952: -#line 2280 "char_ref.rl" {te = p+1;{ output->first = 0x2336; {p++; goto _out; } }} break; case 1953: -#line 2281 "char_ref.rl" {te = p+1;{ output->first = 0x2af1; {p++; goto _out; } }} break; case 1954: -#line 2282 "char_ref.rl" {te = p+1;{ output->first = 0x0001d565; {p++; goto _out; } }} break; case 1955: -#line 2283 "char_ref.rl" {te = p+1;{ output->first = 0x2ada; {p++; goto _out; } }} break; case 1956: -#line 2284 "char_ref.rl" {te = p+1;{ output->first = 0x2929; {p++; goto _out; } }} break; case 1957: -#line 2285 "char_ref.rl" {te = p+1;{ output->first = 0x2034; {p++; goto _out; } }} break; case 1958: -#line 2286 "char_ref.rl" {te = p+1;{ output->first = 0x2122; {p++; goto _out; } }} break; case 1959: -#line 2287 "char_ref.rl" {te = p+1;{ output->first = 0x25b5; {p++; goto _out; } }} break; case 1960: -#line 2288 "char_ref.rl" {te = p+1;{ output->first = 0x25bf; {p++; goto _out; } }} break; case 1961: -#line 2289 "char_ref.rl" {te = p+1;{ output->first = 0x25c3; {p++; goto _out; } }} break; 
case 1962: -#line 2290 "char_ref.rl" {te = p+1;{ output->first = 0x22b4; {p++; goto _out; } }} break; case 1963: -#line 2291 "char_ref.rl" {te = p+1;{ output->first = 0x225c; {p++; goto _out; } }} break; case 1964: -#line 2292 "char_ref.rl" {te = p+1;{ output->first = 0x25b9; {p++; goto _out; } }} break; case 1965: -#line 2293 "char_ref.rl" {te = p+1;{ output->first = 0x22b5; {p++; goto _out; } }} break; case 1966: -#line 2294 "char_ref.rl" {te = p+1;{ output->first = 0x25ec; {p++; goto _out; } }} break; case 1967: -#line 2295 "char_ref.rl" {te = p+1;{ output->first = 0x225c; {p++; goto _out; } }} break; case 1968: -#line 2296 "char_ref.rl" {te = p+1;{ output->first = 0x2a3a; {p++; goto _out; } }} break; case 1969: -#line 2297 "char_ref.rl" {te = p+1;{ output->first = 0x2a39; {p++; goto _out; } }} break; case 1970: -#line 2298 "char_ref.rl" {te = p+1;{ output->first = 0x29cd; {p++; goto _out; } }} break; case 1971: -#line 2299 "char_ref.rl" {te = p+1;{ output->first = 0x2a3b; {p++; goto _out; } }} break; case 1972: -#line 2300 "char_ref.rl" {te = p+1;{ output->first = 0x23e2; {p++; goto _out; } }} break; case 1973: -#line 2301 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4c9; {p++; goto _out; } }} break; case 1974: -#line 2302 "char_ref.rl" {te = p+1;{ output->first = 0x0446; {p++; goto _out; } }} break; case 1975: -#line 2303 "char_ref.rl" {te = p+1;{ output->first = 0x045b; {p++; goto _out; } }} break; case 1976: -#line 2304 "char_ref.rl" {te = p+1;{ output->first = 0x0167; {p++; goto _out; } }} break; case 1977: -#line 2305 "char_ref.rl" {te = p+1;{ output->first = 0x226c; {p++; goto _out; } }} break; case 1978: -#line 2306 "char_ref.rl" {te = p+1;{ output->first = 0x219e; {p++; goto _out; } }} break; case 1979: -#line 2307 "char_ref.rl" {te = p+1;{ output->first = 0x21a0; {p++; goto _out; } }} break; case 1980: -#line 2308 "char_ref.rl" {te = p+1;{ output->first = 0x21d1; {p++; goto _out; } }} break; case 1981: -#line 2309 "char_ref.rl" {te = p+1;{ 
output->first = 0x2963; {p++; goto _out; } }} break; case 1982: -#line 2310 "char_ref.rl" {te = p+1;{ output->first = 0xfa; {p++; goto _out; } }} break; case 1983: -#line 2312 "char_ref.rl" {te = p+1;{ output->first = 0x2191; {p++; goto _out; } }} break; case 1984: -#line 2313 "char_ref.rl" {te = p+1;{ output->first = 0x045e; {p++; goto _out; } }} break; case 1985: -#line 2314 "char_ref.rl" {te = p+1;{ output->first = 0x016d; {p++; goto _out; } }} break; case 1986: -#line 2315 "char_ref.rl" {te = p+1;{ output->first = 0xfb; {p++; goto _out; } }} break; case 1987: -#line 2317 "char_ref.rl" {te = p+1;{ output->first = 0x0443; {p++; goto _out; } }} break; case 1988: -#line 2318 "char_ref.rl" {te = p+1;{ output->first = 0x21c5; {p++; goto _out; } }} break; case 1989: -#line 2319 "char_ref.rl" {te = p+1;{ output->first = 0x0171; {p++; goto _out; } }} break; case 1990: -#line 2320 "char_ref.rl" {te = p+1;{ output->first = 0x296e; {p++; goto _out; } }} break; case 1991: -#line 2321 "char_ref.rl" {te = p+1;{ output->first = 0x297e; {p++; goto _out; } }} break; case 1992: -#line 2322 "char_ref.rl" {te = p+1;{ output->first = 0x0001d532; {p++; goto _out; } }} break; case 1993: -#line 2323 "char_ref.rl" {te = p+1;{ output->first = 0xf9; {p++; goto _out; } }} break; case 1994: -#line 2325 "char_ref.rl" {te = p+1;{ output->first = 0x21bf; {p++; goto _out; } }} break; case 1995: -#line 2326 "char_ref.rl" {te = p+1;{ output->first = 0x21be; {p++; goto _out; } }} break; case 1996: -#line 2327 "char_ref.rl" {te = p+1;{ output->first = 0x2580; {p++; goto _out; } }} break; case 1997: -#line 2328 "char_ref.rl" {te = p+1;{ output->first = 0x231c; {p++; goto _out; } }} break; case 1998: -#line 2329 "char_ref.rl" {te = p+1;{ output->first = 0x231c; {p++; goto _out; } }} break; case 1999: -#line 2330 "char_ref.rl" {te = p+1;{ output->first = 0x230f; {p++; goto _out; } }} break; case 2000: -#line 2331 "char_ref.rl" {te = p+1;{ output->first = 0x25f8; {p++; goto _out; } }} break; case 2001: 
-#line 2332 "char_ref.rl" {te = p+1;{ output->first = 0x016b; {p++; goto _out; } }} break; case 2002: -#line 2333 "char_ref.rl" {te = p+1;{ output->first = 0xa8; {p++; goto _out; } }} break; case 2003: -#line 2335 "char_ref.rl" {te = p+1;{ output->first = 0x0173; {p++; goto _out; } }} break; case 2004: -#line 2336 "char_ref.rl" {te = p+1;{ output->first = 0x0001d566; {p++; goto _out; } }} break; case 2005: -#line 2337 "char_ref.rl" {te = p+1;{ output->first = 0x2191; {p++; goto _out; } }} break; case 2006: -#line 2338 "char_ref.rl" {te = p+1;{ output->first = 0x2195; {p++; goto _out; } }} break; case 2007: -#line 2339 "char_ref.rl" {te = p+1;{ output->first = 0x21bf; {p++; goto _out; } }} break; case 2008: -#line 2340 "char_ref.rl" {te = p+1;{ output->first = 0x21be; {p++; goto _out; } }} break; case 2009: -#line 2341 "char_ref.rl" {te = p+1;{ output->first = 0x228e; {p++; goto _out; } }} break; case 2010: -#line 2342 "char_ref.rl" {te = p+1;{ output->first = 0x03c5; {p++; goto _out; } }} break; case 2011: -#line 2343 "char_ref.rl" {te = p+1;{ output->first = 0x03d2; {p++; goto _out; } }} break; case 2012: -#line 2344 "char_ref.rl" {te = p+1;{ output->first = 0x03c5; {p++; goto _out; } }} break; case 2013: -#line 2345 "char_ref.rl" {te = p+1;{ output->first = 0x21c8; {p++; goto _out; } }} break; case 2014: -#line 2346 "char_ref.rl" {te = p+1;{ output->first = 0x231d; {p++; goto _out; } }} break; case 2015: -#line 2347 "char_ref.rl" {te = p+1;{ output->first = 0x231d; {p++; goto _out; } }} break; case 2016: -#line 2348 "char_ref.rl" {te = p+1;{ output->first = 0x230e; {p++; goto _out; } }} break; case 2017: -#line 2349 "char_ref.rl" {te = p+1;{ output->first = 0x016f; {p++; goto _out; } }} break; case 2018: -#line 2350 "char_ref.rl" {te = p+1;{ output->first = 0x25f9; {p++; goto _out; } }} break; case 2019: -#line 2351 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4ca; {p++; goto _out; } }} break; case 2020: -#line 2352 "char_ref.rl" {te = p+1;{ output->first = 
0x22f0; {p++; goto _out; } }} break; case 2021: -#line 2353 "char_ref.rl" {te = p+1;{ output->first = 0x0169; {p++; goto _out; } }} break; case 2022: -#line 2354 "char_ref.rl" {te = p+1;{ output->first = 0x25b5; {p++; goto _out; } }} break; case 2023: -#line 2355 "char_ref.rl" {te = p+1;{ output->first = 0x25b4; {p++; goto _out; } }} break; case 2024: -#line 2356 "char_ref.rl" {te = p+1;{ output->first = 0x21c8; {p++; goto _out; } }} break; case 2025: -#line 2357 "char_ref.rl" {te = p+1;{ output->first = 0xfc; {p++; goto _out; } }} break; case 2026: -#line 2359 "char_ref.rl" {te = p+1;{ output->first = 0x29a7; {p++; goto _out; } }} break; case 2027: -#line 2360 "char_ref.rl" {te = p+1;{ output->first = 0x21d5; {p++; goto _out; } }} break; case 2028: -#line 2361 "char_ref.rl" {te = p+1;{ output->first = 0x2ae8; {p++; goto _out; } }} break; case 2029: -#line 2362 "char_ref.rl" {te = p+1;{ output->first = 0x2ae9; {p++; goto _out; } }} break; case 2030: -#line 2363 "char_ref.rl" {te = p+1;{ output->first = 0x22a8; {p++; goto _out; } }} break; case 2031: -#line 2364 "char_ref.rl" {te = p+1;{ output->first = 0x299c; {p++; goto _out; } }} break; case 2032: -#line 2365 "char_ref.rl" {te = p+1;{ output->first = 0x03f5; {p++; goto _out; } }} break; case 2033: -#line 2366 "char_ref.rl" {te = p+1;{ output->first = 0x03f0; {p++; goto _out; } }} break; case 2034: -#line 2367 "char_ref.rl" {te = p+1;{ output->first = 0x2205; {p++; goto _out; } }} break; case 2035: -#line 2368 "char_ref.rl" {te = p+1;{ output->first = 0x03d5; {p++; goto _out; } }} break; case 2036: -#line 2369 "char_ref.rl" {te = p+1;{ output->first = 0x03d6; {p++; goto _out; } }} break; case 2037: -#line 2370 "char_ref.rl" {te = p+1;{ output->first = 0x221d; {p++; goto _out; } }} break; case 2038: -#line 2371 "char_ref.rl" {te = p+1;{ output->first = 0x2195; {p++; goto _out; } }} break; case 2039: -#line 2372 "char_ref.rl" {te = p+1;{ output->first = 0x03f1; {p++; goto _out; } }} break; case 2040: -#line 2373 
"char_ref.rl" {te = p+1;{ output->first = 0x03c2; {p++; goto _out; } }} break; case 2041: -#line 2374 "char_ref.rl" {te = p+1;{ output->first = 0x228a; output->second = 0xfe00; {p++; goto _out; } }} break; case 2042: -#line 2375 "char_ref.rl" {te = p+1;{ output->first = 0x2acb; output->second = 0xfe00; {p++; goto _out; } }} break; case 2043: -#line 2376 "char_ref.rl" {te = p+1;{ output->first = 0x228b; output->second = 0xfe00; {p++; goto _out; } }} break; case 2044: -#line 2377 "char_ref.rl" {te = p+1;{ output->first = 0x2acc; output->second = 0xfe00; {p++; goto _out; } }} break; case 2045: -#line 2378 "char_ref.rl" {te = p+1;{ output->first = 0x03d1; {p++; goto _out; } }} break; case 2046: -#line 2379 "char_ref.rl" {te = p+1;{ output->first = 0x22b2; {p++; goto _out; } }} break; case 2047: -#line 2380 "char_ref.rl" {te = p+1;{ output->first = 0x22b3; {p++; goto _out; } }} break; case 2048: -#line 2381 "char_ref.rl" {te = p+1;{ output->first = 0x0432; {p++; goto _out; } }} break; case 2049: -#line 2382 "char_ref.rl" {te = p+1;{ output->first = 0x22a2; {p++; goto _out; } }} break; case 2050: -#line 2383 "char_ref.rl" {te = p+1;{ output->first = 0x2228; {p++; goto _out; } }} break; case 2051: -#line 2384 "char_ref.rl" {te = p+1;{ output->first = 0x22bb; {p++; goto _out; } }} break; case 2052: -#line 2385 "char_ref.rl" {te = p+1;{ output->first = 0x225a; {p++; goto _out; } }} break; case 2053: -#line 2386 "char_ref.rl" {te = p+1;{ output->first = 0x22ee; {p++; goto _out; } }} break; case 2054: -#line 2387 "char_ref.rl" {te = p+1;{ output->first = 0x7c; {p++; goto _out; } }} break; case 2055: -#line 2388 "char_ref.rl" {te = p+1;{ output->first = 0x7c; {p++; goto _out; } }} break; case 2056: -#line 2389 "char_ref.rl" {te = p+1;{ output->first = 0x0001d533; {p++; goto _out; } }} break; case 2057: -#line 2390 "char_ref.rl" {te = p+1;{ output->first = 0x22b2; {p++; goto _out; } }} break; case 2058: -#line 2391 "char_ref.rl" {te = p+1;{ output->first = 0x2282; 
output->second = 0x20d2; {p++; goto _out; } }} break; case 2059: -#line 2392 "char_ref.rl" {te = p+1;{ output->first = 0x2283; output->second = 0x20d2; {p++; goto _out; } }} break; case 2060: -#line 2393 "char_ref.rl" {te = p+1;{ output->first = 0x0001d567; {p++; goto _out; } }} break; case 2061: -#line 2394 "char_ref.rl" {te = p+1;{ output->first = 0x221d; {p++; goto _out; } }} break; case 2062: -#line 2395 "char_ref.rl" {te = p+1;{ output->first = 0x22b3; {p++; goto _out; } }} break; case 2063: -#line 2396 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4cb; {p++; goto _out; } }} break; case 2064: -#line 2397 "char_ref.rl" {te = p+1;{ output->first = 0x2acb; output->second = 0xfe00; {p++; goto _out; } }} break; case 2065: -#line 2398 "char_ref.rl" {te = p+1;{ output->first = 0x228a; output->second = 0xfe00; {p++; goto _out; } }} break; case 2066: -#line 2399 "char_ref.rl" {te = p+1;{ output->first = 0x2acc; output->second = 0xfe00; {p++; goto _out; } }} break; case 2067: -#line 2400 "char_ref.rl" {te = p+1;{ output->first = 0x228b; output->second = 0xfe00; {p++; goto _out; } }} break; case 2068: -#line 2401 "char_ref.rl" {te = p+1;{ output->first = 0x299a; {p++; goto _out; } }} break; case 2069: -#line 2402 "char_ref.rl" {te = p+1;{ output->first = 0x0175; {p++; goto _out; } }} break; case 2070: -#line 2403 "char_ref.rl" {te = p+1;{ output->first = 0x2a5f; {p++; goto _out; } }} break; case 2071: -#line 2404 "char_ref.rl" {te = p+1;{ output->first = 0x2227; {p++; goto _out; } }} break; case 2072: -#line 2405 "char_ref.rl" {te = p+1;{ output->first = 0x2259; {p++; goto _out; } }} break; case 2073: -#line 2406 "char_ref.rl" {te = p+1;{ output->first = 0x2118; {p++; goto _out; } }} break; case 2074: -#line 2407 "char_ref.rl" {te = p+1;{ output->first = 0x0001d534; {p++; goto _out; } }} break; case 2075: -#line 2408 "char_ref.rl" {te = p+1;{ output->first = 0x0001d568; {p++; goto _out; } }} break; case 2076: -#line 2409 "char_ref.rl" {te = p+1;{ output->first = 
0x2118; {p++; goto _out; } }} break; case 2077: -#line 2410 "char_ref.rl" {te = p+1;{ output->first = 0x2240; {p++; goto _out; } }} break; case 2078: -#line 2411 "char_ref.rl" {te = p+1;{ output->first = 0x2240; {p++; goto _out; } }} break; case 2079: -#line 2412 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4cc; {p++; goto _out; } }} break; case 2080: -#line 2413 "char_ref.rl" {te = p+1;{ output->first = 0x22c2; {p++; goto _out; } }} break; case 2081: -#line 2414 "char_ref.rl" {te = p+1;{ output->first = 0x25ef; {p++; goto _out; } }} break; case 2082: -#line 2415 "char_ref.rl" {te = p+1;{ output->first = 0x22c3; {p++; goto _out; } }} break; case 2083: -#line 2416 "char_ref.rl" {te = p+1;{ output->first = 0x25bd; {p++; goto _out; } }} break; case 2084: -#line 2417 "char_ref.rl" {te = p+1;{ output->first = 0x0001d535; {p++; goto _out; } }} break; case 2085: -#line 2418 "char_ref.rl" {te = p+1;{ output->first = 0x27fa; {p++; goto _out; } }} break; case 2086: -#line 2419 "char_ref.rl" {te = p+1;{ output->first = 0x27f7; {p++; goto _out; } }} break; case 2087: -#line 2420 "char_ref.rl" {te = p+1;{ output->first = 0x03be; {p++; goto _out; } }} break; case 2088: -#line 2421 "char_ref.rl" {te = p+1;{ output->first = 0x27f8; {p++; goto _out; } }} break; case 2089: -#line 2422 "char_ref.rl" {te = p+1;{ output->first = 0x27f5; {p++; goto _out; } }} break; case 2090: -#line 2423 "char_ref.rl" {te = p+1;{ output->first = 0x27fc; {p++; goto _out; } }} break; case 2091: -#line 2424 "char_ref.rl" {te = p+1;{ output->first = 0x22fb; {p++; goto _out; } }} break; case 2092: -#line 2425 "char_ref.rl" {te = p+1;{ output->first = 0x2a00; {p++; goto _out; } }} break; case 2093: -#line 2426 "char_ref.rl" {te = p+1;{ output->first = 0x0001d569; {p++; goto _out; } }} break; case 2094: -#line 2427 "char_ref.rl" {te = p+1;{ output->first = 0x2a01; {p++; goto _out; } }} break; case 2095: -#line 2428 "char_ref.rl" {te = p+1;{ output->first = 0x2a02; {p++; goto _out; } }} break; case 2096: 
-#line 2429 "char_ref.rl" {te = p+1;{ output->first = 0x27f9; {p++; goto _out; } }} break; case 2097: -#line 2430 "char_ref.rl" {te = p+1;{ output->first = 0x27f6; {p++; goto _out; } }} break; case 2098: -#line 2431 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4cd; {p++; goto _out; } }} break; case 2099: -#line 2432 "char_ref.rl" {te = p+1;{ output->first = 0x2a06; {p++; goto _out; } }} break; case 2100: -#line 2433 "char_ref.rl" {te = p+1;{ output->first = 0x2a04; {p++; goto _out; } }} break; case 2101: -#line 2434 "char_ref.rl" {te = p+1;{ output->first = 0x25b3; {p++; goto _out; } }} break; case 2102: -#line 2435 "char_ref.rl" {te = p+1;{ output->first = 0x22c1; {p++; goto _out; } }} break; case 2103: -#line 2436 "char_ref.rl" {te = p+1;{ output->first = 0x22c0; {p++; goto _out; } }} break; case 2104: -#line 2437 "char_ref.rl" {te = p+1;{ output->first = 0xfd; {p++; goto _out; } }} break; case 2105: -#line 2439 "char_ref.rl" {te = p+1;{ output->first = 0x044f; {p++; goto _out; } }} break; case 2106: -#line 2440 "char_ref.rl" {te = p+1;{ output->first = 0x0177; {p++; goto _out; } }} break; case 2107: -#line 2441 "char_ref.rl" {te = p+1;{ output->first = 0x044b; {p++; goto _out; } }} break; case 2108: -#line 2442 "char_ref.rl" {te = p+1;{ output->first = 0xa5; {p++; goto _out; } }} break; case 2109: -#line 2444 "char_ref.rl" {te = p+1;{ output->first = 0x0001d536; {p++; goto _out; } }} break; case 2110: -#line 2445 "char_ref.rl" {te = p+1;{ output->first = 0x0457; {p++; goto _out; } }} break; case 2111: -#line 2446 "char_ref.rl" {te = p+1;{ output->first = 0x0001d56a; {p++; goto _out; } }} break; case 2112: -#line 2447 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4ce; {p++; goto _out; } }} break; case 2113: -#line 2448 "char_ref.rl" {te = p+1;{ output->first = 0x044e; {p++; goto _out; } }} break; case 2114: -#line 2449 "char_ref.rl" {te = p+1;{ output->first = 0xff; {p++; goto _out; } }} break; case 2115: -#line 2451 "char_ref.rl" {te = p+1;{ output->first 
= 0x017a; {p++; goto _out; } }} break; case 2116: -#line 2452 "char_ref.rl" {te = p+1;{ output->first = 0x017e; {p++; goto _out; } }} break; case 2117: -#line 2453 "char_ref.rl" {te = p+1;{ output->first = 0x0437; {p++; goto _out; } }} break; case 2118: -#line 2454 "char_ref.rl" {te = p+1;{ output->first = 0x017c; {p++; goto _out; } }} break; case 2119: -#line 2455 "char_ref.rl" {te = p+1;{ output->first = 0x2128; {p++; goto _out; } }} break; case 2120: -#line 2456 "char_ref.rl" {te = p+1;{ output->first = 0x03b6; {p++; goto _out; } }} break; case 2121: -#line 2457 "char_ref.rl" {te = p+1;{ output->first = 0x0001d537; {p++; goto _out; } }} break; case 2122: -#line 2458 "char_ref.rl" {te = p+1;{ output->first = 0x0436; {p++; goto _out; } }} break; case 2123: -#line 2459 "char_ref.rl" {te = p+1;{ output->first = 0x21dd; {p++; goto _out; } }} break; case 2124: -#line 2460 "char_ref.rl" {te = p+1;{ output->first = 0x0001d56b; {p++; goto _out; } }} break; case 2125: -#line 2461 "char_ref.rl" {te = p+1;{ output->first = 0x0001d4cf; {p++; goto _out; } }} break; case 2126: -#line 2462 "char_ref.rl" {te = p+1;{ output->first = 0x200d; {p++; goto _out; } }} break; case 2127: -#line 2463 "char_ref.rl" {te = p+1;{ output->first = 0x200c; {p++; goto _out; } }} break; case 2128: -#line 234 "char_ref.rl" {te = p;p--;{ output->first = 0xc6; {p++; goto _out; } }} break; case 2129: -#line 236 "char_ref.rl" {te = p;p--;{ output->first = 0x26; {p++; goto _out; } }} break; case 2130: -#line 238 "char_ref.rl" {te = p;p--;{ output->first = 0xc1; {p++; goto _out; } }} break; case 2131: -#line 241 "char_ref.rl" {te = p;p--;{ output->first = 0xc2; {p++; goto _out; } }} break; case 2132: -#line 245 "char_ref.rl" {te = p;p--;{ output->first = 0xc0; {p++; goto _out; } }} break; case 2133: -#line 253 "char_ref.rl" {te = p;p--;{ output->first = 0xc5; {p++; goto _out; } }} break; case 2134: -#line 257 "char_ref.rl" {te = p;p--;{ output->first = 0xc3; {p++; goto _out; } }} break; case 2135: -#line 
259 "char_ref.rl" {te = p;p--;{ output->first = 0xc4; {p++; goto _out; } }} break; case 2136: -#line 274 "char_ref.rl" {te = p;p--;{ output->first = 0xa9; {p++; goto _out; } }} break; case 2137: -#line 281 "char_ref.rl" {te = p;p--;{ output->first = 0xc7; {p++; goto _out; } }} break; case 2138: -#line 364 "char_ref.rl" {te = p;p--;{ output->first = 0xd0; {p++; goto _out; } }} break; case 2139: -#line 366 "char_ref.rl" {te = p;p--;{ output->first = 0xc9; {p++; goto _out; } }} break; case 2140: -#line 369 "char_ref.rl" {te = p;p--;{ output->first = 0xca; {p++; goto _out; } }} break; case 2141: -#line 374 "char_ref.rl" {te = p;p--;{ output->first = 0xc8; {p++; goto _out; } }} break; case 2142: -#line 389 "char_ref.rl" {te = p;p--;{ output->first = 0xcb; {p++; goto _out; } }} break; case 2143: -#line 402 "char_ref.rl" {te = p;p--;{ output->first = 0x3e; {p++; goto _out; } }} break; case 2144: -#line 438 "char_ref.rl" {te = p;p--;{ output->first = 0xcd; {p++; goto _out; } }} break; case 2145: -#line 440 "char_ref.rl" {te = p;p--;{ output->first = 0xce; {p++; goto _out; } }} break; case 2146: -#line 445 "char_ref.rl" {te = p;p--;{ output->first = 0xcc; {p++; goto _out; } }} break; case 2147: -#line 462 "char_ref.rl" {te = p;p--;{ output->first = 0xcf; {p++; goto _out; } }} break; case 2148: -#line 480 "char_ref.rl" {te = p;p--;{ output->first = 0x3c; {p++; goto _out; } }} break; case 2149: -#line 617 "char_ref.rl" {te = p;p--;{ output->first = 0xd1; {p++; goto _out; } }} break; case 2150: -#line 621 "char_ref.rl" {te = p;p--;{ output->first = 0xd3; {p++; goto _out; } }} break; case 2151: -#line 623 "char_ref.rl" {te = p;p--;{ output->first = 0xd4; {p++; goto _out; } }} break; case 2152: -#line 628 "char_ref.rl" {te = p;p--;{ output->first = 0xd2; {p++; goto _out; } }} break; case 2153: -#line 638 "char_ref.rl" {te = p;p--;{ output->first = 0xd8; {p++; goto _out; } }} break; case 2154: -#line 640 "char_ref.rl" {te = p;p--;{ output->first = 0xd5; {p++; goto _out; } }} 
break; case 2155: -#line 643 "char_ref.rl" {te = p;p--;{ output->first = 0xd6; {p++; goto _out; } }} break; case 2156: -#line 668 "char_ref.rl" {te = p;p--;{ output->first = 0x22; {p++; goto _out; } }} break; case 2157: -#line 674 "char_ref.rl" {te = p;p--;{ output->first = 0xae; {p++; goto _out; } }} break; case 2158: -#line 758 "char_ref.rl" {te = p;p--;{ output->first = 0xde; {p++; goto _out; } }} break; case 2159: -#line 781 "char_ref.rl" {te = p;p--;{ output->first = 0xda; {p++; goto _out; } }} break; case 2160: -#line 787 "char_ref.rl" {te = p;p--;{ output->first = 0xdb; {p++; goto _out; } }} break; case 2161: -#line 792 "char_ref.rl" {te = p;p--;{ output->first = 0xd9; {p++; goto _out; } }} break; case 2162: -#line 819 "char_ref.rl" {te = p;p--;{ output->first = 0xdc; {p++; goto _out; } }} break; case 2163: -#line 850 "char_ref.rl" {te = p;p--;{ output->first = 0xdd; {p++; goto _out; } }} break; case 2164: -#line 868 "char_ref.rl" {te = p;p--;{ output->first = 0xe1; {p++; goto _out; } }} break; case 2165: -#line 874 "char_ref.rl" {te = p;p--;{ output->first = 0xe2; {p++; goto _out; } }} break; case 2166: -#line 876 "char_ref.rl" {te = p;p--;{ output->first = 0xb4; {p++; goto _out; } }} break; case 2167: -#line 879 "char_ref.rl" {te = p;p--;{ output->first = 0xe6; {p++; goto _out; } }} break; case 2168: -#line 883 "char_ref.rl" {te = p;p--;{ output->first = 0xe0; {p++; goto _out; } }} break; case 2169: -#line 890 "char_ref.rl" {te = p;p--;{ output->first = 0x26; {p++; goto _out; } }} break; case 2170: -#line 925 "char_ref.rl" {te = p;p--;{ output->first = 0xe5; {p++; goto _out; } }} break; case 2171: -#line 931 "char_ref.rl" {te = p;p--;{ output->first = 0xe3; {p++; goto _out; } }} break; case 2172: -#line 933 "char_ref.rl" {te = p;p--;{ output->first = 0xe4; {p++; goto _out; } }} break; case 2173: -#line 1038 "char_ref.rl" {te = p;p--;{ output->first = 0xa6; {p++; goto _out; } }} break; case 2174: -#line 1065 "char_ref.rl" {te = p;p--;{ output->first = 0xe7; 
{p++; goto _out; } }} break; case 2175: -#line 1071 "char_ref.rl" {te = p;p--;{ output->first = 0xb8; {p++; goto _out; } }} break; case 2176: -#line 1074 "char_ref.rl" {te = p;p--;{ output->first = 0xa2; {p++; goto _out; } }} break; case 2177: -#line 1113 "char_ref.rl" {te = p;p--;{ output->first = 0xa9; {p++; goto _out; } }} break; case 2178: -#line 1143 "char_ref.rl" {te = p;p--;{ output->first = 0xa4; {p++; goto _out; } }} break; case 2179: -#line 1167 "char_ref.rl" {te = p;p--;{ output->first = 0xb0; {p++; goto _out; } }} break; case 2180: -#line 1183 "char_ref.rl" {te = p;p--;{ output->first = 0xf7; {p++; goto _out; } }} break; case 2181: -#line 1220 "char_ref.rl" {te = p;p--;{ output->first = 0xe9; {p++; goto _out; } }} break; case 2182: -#line 1225 "char_ref.rl" {te = p;p--;{ output->first = 0xea; {p++; goto _out; } }} break; case 2183: -#line 1234 "char_ref.rl" {te = p;p--;{ output->first = 0xe8; {p++; goto _out; } }} break; case 2184: -#line 1276 "char_ref.rl" {te = p;p--;{ output->first = 0xf0; {p++; goto _out; } }} break; case 2185: -#line 1278 "char_ref.rl" {te = p;p--;{ output->first = 0xeb; {p++; goto _out; } }} break; case 2186: -#line 1303 "char_ref.rl" {te = p;p--;{ output->first = 0xbd; {p++; goto _out; } }} break; case 2187: -#line 1306 "char_ref.rl" {te = p;p--;{ output->first = 0xbc; {p++; goto _out; } }} break; case 2188: -#line 1313 "char_ref.rl" {te = p;p--;{ output->first = 0xbe; {p++; goto _out; } }} break; case 2189: -#line 1368 "char_ref.rl" {te = p;p--;{ output->first = 0x3e; {p++; goto _out; } }} break; case 2190: -#line 1412 "char_ref.rl" {te = p;p--;{ output->first = 0xed; {p++; goto _out; } }} break; case 2191: -#line 1415 "char_ref.rl" {te = p;p--;{ output->first = 0xee; {p++; goto _out; } }} break; case 2192: -#line 1419 "char_ref.rl" {te = p;p--;{ output->first = 0xa1; {p++; goto _out; } }} break; case 2193: -#line 1423 "char_ref.rl" {te = p;p--;{ output->first = 0xec; {p++; goto _out; } }} break; case 2194: -#line 1454 
"char_ref.rl" {te = p;p--;{ output->first = 0xbf; {p++; goto _out; } }} break; case 2195: -#line 1466 "char_ref.rl" {te = p;p--;{ output->first = 0xef; {p++; goto _out; } }} break; case 2196: -#line 1501 "char_ref.rl" {te = p;p--;{ output->first = 0xab; {p++; goto _out; } }} break; case 2197: -#line 1623 "char_ref.rl" {te = p;p--;{ output->first = 0x3c; {p++; goto _out; } }} break; case 2198: -#line 1641 "char_ref.rl" {te = p;p--;{ output->first = 0xaf; {p++; goto _out; } }} break; case 2199: -#line 1658 "char_ref.rl" {te = p;p--;{ output->first = 0xb5; {p++; goto _out; } }} break; case 2200: -#line 1663 "char_ref.rl" {te = p;p--;{ output->first = 0xb7; {p++; goto _out; } }} break; case 2201: -#line 1702 "char_ref.rl" {te = p;p--;{ output->first = 0xa0; {p++; goto _out; } }} break; case 2202: -#line 1771 "char_ref.rl" {te = p;p--;{ output->first = 0xac; {p++; goto _out; } }} break; case 2203: -#line 1818 "char_ref.rl" {te = p;p--;{ output->first = 0xf1; {p++; goto _out; } }} break; case 2204: -#line 1849 "char_ref.rl" {te = p;p--;{ output->first = 0xf3; {p++; goto _out; } }} break; case 2205: -#line 1853 "char_ref.rl" {te = p;p--;{ output->first = 0xf4; {p++; goto _out; } }} break; case 2206: -#line 1865 "char_ref.rl" {te = p;p--;{ output->first = 0xf2; {p++; goto _out; } }} break; case 2207: -#line 1890 "char_ref.rl" {te = p;p--;{ output->first = 0xaa; {p++; goto _out; } }} break; case 2208: -#line 1892 "char_ref.rl" {te = p;p--;{ output->first = 0xba; {p++; goto _out; } }} break; case 2209: -#line 1899 "char_ref.rl" {te = p;p--;{ output->first = 0xf8; {p++; goto _out; } }} break; case 2210: -#line 1902 "char_ref.rl" {te = p;p--;{ output->first = 0xf5; {p++; goto _out; } }} break; case 2211: -#line 1906 "char_ref.rl" {te = p;p--;{ output->first = 0xf6; {p++; goto _out; } }} break; case 2212: -#line 1910 "char_ref.rl" {te = p;p--;{ output->first = 0xb6; {p++; goto _out; } }} break; case 2213: -#line 1940 "char_ref.rl" {te = p;p--;{ output->first = 0xb1; {p++; goto 
_out; } }} break; case 2214: -#line 1947 "char_ref.rl" {te = p;p--;{ output->first = 0xa3; {p++; goto _out; } }} break; case 2215: -#line 1987 "char_ref.rl" {te = p;p--;{ output->first = 0x22; {p++; goto _out; } }} break; case 2216: -#line 2002 "char_ref.rl" {te = p;p--;{ output->first = 0xbb; {p++; goto _out; } }} break; case 2217: -#line 2041 "char_ref.rl" {te = p;p--;{ output->first = 0xae; {p++; goto _out; } }} break; case 2218: -#line 2116 "char_ref.rl" {te = p;p--;{ output->first = 0xa7; {p++; goto _out; } }} break; case 2219: -#line 2130 "char_ref.rl" {te = p;p--;{ output->first = 0xad; {p++; goto _out; } }} break; case 2220: -#line 2217 "char_ref.rl" {te = p;p--;{ output->first = 0xb9; {p++; goto _out; } }} break; case 2221: -#line 2219 "char_ref.rl" {te = p;p--;{ output->first = 0xb2; {p++; goto _out; } }} break; case 2222: -#line 2221 "char_ref.rl" {te = p;p--;{ output->first = 0xb3; {p++; goto _out; } }} break; case 2223: -#line 2249 "char_ref.rl" {te = p;p--;{ output->first = 0xdf; {p++; goto _out; } }} break; case 2224: -#line 2270 "char_ref.rl" {te = p;p--;{ output->first = 0xfe; {p++; goto _out; } }} break; case 2225: -#line 2273 "char_ref.rl" {te = p;p--;{ output->first = 0xd7; {p++; goto _out; } }} break; case 2226: -#line 2311 "char_ref.rl" {te = p;p--;{ output->first = 0xfa; {p++; goto _out; } }} break; case 2227: -#line 2316 "char_ref.rl" {te = p;p--;{ output->first = 0xfb; {p++; goto _out; } }} break; case 2228: -#line 2324 "char_ref.rl" {te = p;p--;{ output->first = 0xf9; {p++; goto _out; } }} break; case 2229: -#line 2334 "char_ref.rl" {te = p;p--;{ output->first = 0xa8; {p++; goto _out; } }} break; case 2230: -#line 2358 "char_ref.rl" {te = p;p--;{ output->first = 0xfc; {p++; goto _out; } }} break; case 2231: -#line 2438 "char_ref.rl" {te = p;p--;{ output->first = 0xfd; {p++; goto _out; } }} break; case 2232: -#line 2443 "char_ref.rl" {te = p;p--;{ output->first = 0xa5; {p++; goto _out; } }} break; case 2233: -#line 2450 "char_ref.rl" {te = 
p;p--;{ output->first = 0xff; {p++; goto _out; } }} break; case 2234: -#line 1074 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0xa2; {p++; goto _out; } }} break; case 2235: -#line 1113 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0xa9; {p++; goto _out; } }} break; case 2236: -#line 1183 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0xf7; {p++; goto _out; } }} break; case 2237: -#line 1368 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0x3e; {p++; goto _out; } }} break; case 2238: -#line 1623 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0x3c; {p++; goto _out; } }} break; case 2239: -#line 1771 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0xac; {p++; goto _out; } }} break; case 2240: -#line 1910 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0xb6; {p++; goto _out; } }} break; case 2241: -#line 2273 "char_ref.rl" {{p = ((te))-1;}{ output->first = 0xd7; {p++; goto _out; } }} break; -#line 23006 "char_ref.c" } } @@ -22980,10 +20758,8 @@ static bool consume_named_ref(struct GumboInternalParser* parser, while ( _nacts-- > 0 ) { switch ( *_acts++ ) { case 0: -#line 1 "NONE" {ts = 0;} break; -#line 23019 "char_ref.c" } } @@ -23003,18 +20779,17 @@ static bool consume_named_ref(struct GumboInternalParser* parser, _out: {} } -#line 2491 "char_ref.rl" - // clang-format on if (cs >= 7623) { assert(output->first != kGumboNoChar); char last_char = *(te - 1); - int len = te - start; + size_t len = te - start; if (last_char == ';') { bool matched = utf8iterator_maybe_consume_match(input, start, len, true); assert(matched); + UNUSED_IF_NDEBUG(matched); return true; - } else if (is_in_attribute && (*te == '=' || isalnum(*te))) { + } else if (is_in_attribute && (*te == '=' || ascii_isalnum(*te))) { output->first = kGumboNoChar; output->second = kGumboNoChar; utf8iterator_reset(input); @@ -23023,10 +20798,15 @@ static bool consume_named_ref(struct GumboInternalParser* parser, GumboStringPiece bad_ref; bad_ref.length = te - start; bad_ref.data = start; - 
add_named_reference_error( - parser, input, GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, bad_ref); + add_named_reference_error ( + parser, + input, + GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, + bad_ref + ); bool matched = utf8iterator_maybe_consume_match(input, start, len, true); assert(matched); + UNUSED_IF_NDEBUG(matched); return false; } } else { @@ -23038,9 +20818,13 @@ static bool consume_named_ref(struct GumboInternalParser* parser, } } -bool consume_char_ref(struct GumboInternalParser* parser, - struct GumboInternalUtf8Iterator* input, int additional_allowed_char, - bool is_in_attribute, OneOrTwoCodepoints* output) { +bool gumbo_consume_char_ref ( + struct GumboInternalParser* parser, + struct GumboInternalUtf8Iterator* input, + int additional_allowed_char, + bool is_in_attribute, + OneOrTwoCodepoints* output +) { utf8iterator_mark(input); utf8iterator_next(input); int c = utf8iterator_current(input); diff --git a/gumbo-parser/src/char_ref.h b/gumbo-parser/src/char_ref.h index 09d2598f..153858d2 100644 --- a/gumbo-parser/src/char_ref.h +++ b/gumbo-parser/src/char_ref.h @@ -1,23 +1,3 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// Internal header for character reference handling; this should not be exposed -// transitively by any public API header. This is why the functions aren't -// namespaced. 
- #ifndef GUMBO_CHAR_REF_H_ #define GUMBO_CHAR_REF_H_ @@ -34,8 +14,8 @@ struct GumboInternalUtf8Iterator; extern const int kGumboNoChar; // Certain named character references generate two codepoints, not one, and so -// the consume_char_ref subroutine needs to return this instead of an int. The -// first field will be kGumboNoChar if no character reference was found; the +// the gumbo_consume_char_ref subroutine needs to return this instead of an int. +// The first field will be kGumboNoChar if no character reference was found; the // second field will be kGumboNoChar if that is the case or if the character // reference returns only a single codepoint. typedef struct { @@ -45,16 +25,20 @@ typedef struct { // Implements the "consume a character reference" section of the spec. // This reads in characters from the input as necessary, and fills in a -// OneOrTwoCodepoints struct containing the characters read. It may add parse -// errors to the GumboParser's errors vector, if the spec calls for it. Pass a +// OneOrTwoCodepoints struct containing the characters read. It may add parse +// errors to the GumboParser's errors vector, if the spec calls for it. Pass a // space for the "additional allowed char" when the spec says "with no -// additional allowed char". Returns false on parse error, true otherwise. -bool consume_char_ref(struct GumboInternalParser* parser, - struct GumboInternalUtf8Iterator* input, int additional_allowed_char, - bool is_in_attribute, OneOrTwoCodepoints* output); +// additional allowed char". Returns false on parse error, true otherwise. 
+bool gumbo_consume_char_ref ( + struct GumboInternalParser* parser, + struct GumboInternalUtf8Iterator* input, + int additional_allowed_char, + bool is_in_attribute, + OneOrTwoCodepoints* output +); #ifdef __cplusplus } #endif -#endif // GUMBO_CHAR_REF_H_ +#endif // GUMBO_CHAR_REF_H_ diff --git a/gumbo-parser/src/char_ref.rl b/gumbo-parser/src/char_ref.rl index 139a4bbd..3d93ac88 100644 --- a/gumbo-parser/src/char_ref.rl +++ b/gumbo-parser/src/char_ref.rl @@ -1,92 +1,43 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// This is a Ragel state machine re-implementation of the original char_ref.c, -// rewritten to improve efficiency. To generate the .c file from it, -// -// $ ragel -F0 char_ref.rl -// -// The generated source is also checked into source control so that most people -// hacking on the parser do not need to install ragel. +/* + Copyright 2017-2018 Craig Barnes. + Copyright 2011 Google Inc. -#include "char_ref.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at -#include -#include -#include -#include -#include // Only for debug assertions at present. 
+ https://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include "char_ref.h" #include "error.h" -#include "string_piece.h" +#include "macros.h" #include "utf8.h" -#include "util.h" struct GumboInternalParser; const int kGumboNoChar = -1; -// Table of replacement characters. The spec specifies that any occurrence of -// the first character should be replaced by the second character, and a parse -// error recorded. -typedef struct { - int from_char; - int to_char; -} CharReplacement; - -static const CharReplacement kCharReplacements[] = { - { 0x00, 0xfffd }, - { 0x0d, 0x000d }, - { 0x80, 0x20ac }, - { 0x81, 0x0081 }, - { 0x82, 0x201A }, - { 0x83, 0x0192 }, - { 0x84, 0x201E }, - { 0x85, 0x2026 }, - { 0x86, 0x2020 }, - { 0x87, 0x2021 }, - { 0x88, 0x02C6 }, - { 0x89, 0x2030 }, - { 0x8A, 0x0160 }, - { 0x8B, 0x2039 }, - { 0x8C, 0x0152 }, - { 0x8D, 0x008D }, - { 0x8E, 0x017D }, - { 0x8F, 0x008F }, - { 0x90, 0x0090 }, - { 0x91, 0x2018 }, - { 0x92, 0x2019 }, - { 0x93, 0x201C }, - { 0x94, 0x201D }, - { 0x95, 0x2022 }, - { 0x96, 0x2013 }, - { 0x97, 0x2014 }, - { 0x98, 0x02DC }, - { 0x99, 0x2122 }, - { 0x9A, 0x0161 }, - { 0x9B, 0x203A }, - { 0x9C, 0x0153 }, - { 0x9D, 0x009D }, - { 0x9E, 0x017E }, - { 0x9F, 0x0178 }, - // Terminator. 
- { -1, -1 } +static const uint32_t kCharReplacements[] = { + [0x00] = 0xFFFD, [0x0D] = 0x000D, [0x80] = 0x20AC, [0x81] = 0x0081, + [0x82] = 0x201A, [0x83] = 0x0192, [0x84] = 0x201E, [0x85] = 0x2026, + [0x86] = 0x2020, [0x87] = 0x2021, [0x88] = 0x02C6, [0x89] = 0x2030, + [0x8A] = 0x0160, [0x8B] = 0x2039, [0x8C] = 0x0152, [0x8D] = 0x008D, + [0x8E] = 0x017D, [0x8F] = 0x008F, [0x90] = 0x0090, [0x91] = 0x2018, + [0x92] = 0x2019, [0x93] = 0x201C, [0x94] = 0x201D, [0x95] = 0x2022, + [0x96] = 0x2013, [0x97] = 0x2014, [0x98] = 0x02DC, [0x99] = 0x2122, + [0x9A] = 0x0161, [0x9B] = 0x203A, [0x9C] = 0x0153, [0x9D] = 0x009D, + [0x9E] = 0x017E, [0x9F] = 0x0178 }; -static int parse_digit(int c, bool allow_hex) { +static int CONST_FN parse_digit(int c, bool allow_hex) { if (c >= '0' && c <= '9') { return c - '0'; } @@ -99,8 +50,10 @@ static int parse_digit(int c, bool allow_hex) { return -1; } -static void add_no_digit_error( - struct GumboInternalParser* parser, Utf8Iterator* input) { +static void add_no_digit_error ( + struct GumboInternalParser* parser, + Utf8Iterator* input +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -109,9 +62,12 @@ static void add_no_digit_error( error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS; } -static void add_codepoint_error( - struct GumboInternalParser* parser, Utf8Iterator* input, - GumboErrorType type, int codepoint) { +static void add_codepoint_error ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + GumboErrorType type, + int codepoint +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -121,9 +77,12 @@ static void add_codepoint_error( error->v.codepoint = codepoint; } -static void add_named_reference_error( - struct GumboInternalParser* parser, Utf8Iterator* input, - GumboErrorType type, GumboStringPiece text) { +static void add_named_reference_error ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + GumboErrorType type, + GumboStringPiece text +) { GumboError* 
error = gumbo_add_error(parser); if (!error) { return; @@ -133,17 +92,15 @@ static void add_named_reference_error( error->v.text = text; } -static int maybe_replace_codepoint(int codepoint) { - for (int i = 0; kCharReplacements[i].from_char != -1; ++i) { - if (kCharReplacements[i].from_char == codepoint) { - return kCharReplacements[i].to_char; - } - } - return -1; +static uint32_t PURE maybe_replace_codepoint(uint32_t codepoint) { + return (codepoint > 0x9F) ? 0x00 : kCharReplacements[codepoint]; } -static bool consume_numeric_ref( - struct GumboInternalParser* parser, Utf8Iterator* input, int* output) { +static bool consume_numeric_ref ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + int* output +) { utf8iterator_next(input); bool is_hex = false; int c = utf8iterator_current(input); @@ -162,7 +119,7 @@ static bool consume_numeric_ref( return false; } - int codepoint = 0; + uint32_t codepoint = 0; bool status = true; do { codepoint = (codepoint * (is_hex ? 16 : 10)) + digit; @@ -171,31 +128,47 @@ static bool consume_numeric_ref( } while (digit != -1); if (utf8iterator_current(input) != ';') { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, codepoint); + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON, + codepoint + ); status = false; } else { utf8iterator_next(input); } - int replacement = maybe_replace_codepoint(codepoint); - if (replacement != -1) { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint); + uint32_t replacement = maybe_replace_codepoint(codepoint); + if (replacement != 0) { + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + codepoint + ); *output = replacement; return false; } if ((codepoint >= 0xd800 && codepoint <= 0xdfff) || codepoint > 0x10ffff) { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint); + add_codepoint_error ( + parser, + input, + 
GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + codepoint + ); *output = 0xfffd; return false; } - if (utf8_is_invalid_code_point(codepoint) || codepoint == 0xb) { - add_codepoint_error( - parser, input, GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, codepoint); + if (utf8_is_invalid_code_point(codepoint)) { + add_codepoint_error ( + parser, + input, + GUMBO_ERR_NUMERIC_CHAR_REF_INVALID, + codepoint + ); status = false; // But return it anyway, per spec. } @@ -203,15 +176,19 @@ static bool consume_numeric_ref( return status; } -static bool maybe_add_invalid_named_reference( - struct GumboInternalParser* parser, Utf8Iterator* input) { +static bool maybe_add_invalid_named_reference ( + struct GumboInternalParser* parser, + Utf8Iterator* input +) { // The iterator will always be reset in this code path, so we don't need to // worry about consuming characters. const char* start = utf8iterator_get_char_pointer(input); int c = utf8iterator_current(input); - while ((c >= 'a' && c <= 'z') || - (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9')) { + while ( + (c >= 'a' && c <= 'z') + || (c >= 'A' && c <= 'Z') + || (c >= '0' && c <= '9') + ) { utf8iterator_next(input); c = utf8iterator_current(input); } @@ -219,8 +196,12 @@ static bool maybe_add_invalid_named_reference( GumboStringPiece bad_ref; bad_ref.data = start; bad_ref.length = utf8iterator_get_char_pointer(input) - start; - add_named_reference_error( - parser, input, GUMBO_ERR_NAMED_CHAR_REF_INVALID, bad_ref); + add_named_reference_error ( + parser, + input, + GUMBO_ERR_NAMED_CHAR_REF_INVALID, + bad_ref + ); return false; } return true; @@ -2464,13 +2445,30 @@ valid_named_ref := |* *|; }%% -// clang-format off %% write data noerror nofinal; -// clang-format on -static bool consume_named_ref( - struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute, - OneOrTwoCodepoints* output) { +static const unsigned char ascii_alnum_table[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0.. 
15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16.. 31 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 32.. 47 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 48.. 63 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 64.. 79 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 80.. 95 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 96..111 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 112..127 + // 128..255: implicitly zero +}; + +static inline bool PURE ascii_isalnum(unsigned char ch) { + return ascii_alnum_table[ch]; +} + +static bool consume_named_ref ( + struct GumboInternalParser* parser, + Utf8Iterator* input, + bool is_in_attribute, + OneOrTwoCodepoints* output +) { assert(output->first == kGumboNoChar); const char* p = utf8iterator_get_char_pointer(input); const char* pe = utf8iterator_get_end_pointer(input); @@ -2479,7 +2477,6 @@ static bool consume_named_ref( const char *ts, *start; int cs, act; - // clang-format off %% write init; // Avoid unused variable warnings. 
(void) act; @@ -2488,17 +2485,17 @@ static bool consume_named_ref( start = p; %% write exec; - // clang-format on if (cs >= %%{ write first_final; }%%) { assert(output->first != kGumboNoChar); char last_char = *(te - 1); - int len = te - start; + size_t len = te - start; if (last_char == ';') { bool matched = utf8iterator_maybe_consume_match(input, start, len, true); assert(matched); + UNUSED_IF_NDEBUG(matched); return true; - } else if (is_in_attribute && (*te == '=' || isalnum(*te))) { + } else if (is_in_attribute && (*te == '=' || ascii_isalnum(*te))) { output->first = kGumboNoChar; output->second = kGumboNoChar; utf8iterator_reset(input); @@ -2507,10 +2504,15 @@ static bool consume_named_ref( GumboStringPiece bad_ref; bad_ref.length = te - start; bad_ref.data = start; - add_named_reference_error( - parser, input, GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, bad_ref); + add_named_reference_error ( + parser, + input, + GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON, + bad_ref + ); bool matched = utf8iterator_maybe_consume_match(input, start, len, true); assert(matched); + UNUSED_IF_NDEBUG(matched); return false; } } else { @@ -2522,10 +2524,13 @@ static bool consume_named_ref( } } -bool consume_char_ref( - struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input, - int additional_allowed_char, bool is_in_attribute, - OneOrTwoCodepoints* output) { +bool gumbo_consume_char_ref ( + struct GumboInternalParser* parser, + struct GumboInternalUtf8Iterator* input, + int additional_allowed_char, + bool is_in_attribute, + OneOrTwoCodepoints* output +) { utf8iterator_mark(input); utf8iterator_next(input); int c = utf8iterator_current(input); diff --git a/gumbo-parser/src/error.c b/gumbo-parser/src/error.c index 25af6004..f3c50b12 100644 --- a/gumbo-parser/src/error.c +++ b/gumbo-parser/src/error.c @@ -1,157 +1,183 @@ -// Copyright 2010 Google Inc. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2010 Google Inc. -#include "error.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include +#include #include #include #include - +#include "error.h" #include "gumbo.h" +#include "macros.h" #include "parser.h" #include "string_buffer.h" #include "util.h" #include "vector.h" -// Prints a formatted message to a StringBuffer. This automatically resizes the -// StringBuffer as necessary to fit the message. Returns the number of bytes +// Prints a formatted message to a StringBuffer. This automatically resizes the +// StringBuffer as necessary to fit the message. Returns the number of bytes // written. -static int print_message( - GumboParser* parser, GumboStringBuffer* output, const char* format, ...) { +static int PRINTF(2) print_message ( + GumboStringBuffer* output, + const char* format, + ... 
+) { va_list args; int remaining_capacity = output->capacity - output->length; va_start(args, format); - int bytes_written = vsnprintf( - output->data + output->length, remaining_capacity, format, args); + int bytes_written = vsnprintf ( + output->data + output->length, + remaining_capacity, + format, + args + ); va_end(args); #ifdef _MSC_VER if (bytes_written == -1) { // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of // returning the number of bytes that would've been written had there been - // enough. In this case, we'll double the buffer size and hope it fits when + // enough. In this case, we'll double the buffer size and hope it fits when // we retry (letting it fail and returning 0 if it doesn't), since there's // no way to smartly resize the buffer. - gumbo_string_buffer_reserve(parser, output->capacity * 2, output); + gumbo_string_buffer_reserve(output->capacity * 2, output); va_start(args, format); - int result = vsnprintf( - output->data + output->length, remaining_capacity, format, args); + int result = vsnprintf ( + output->data + output->length, + remaining_capacity, + format, + args + ); va_end(args); return result == -1 ? 0 : result; } #else - // -1 in standard C99 indicates an encoding error. Return 0 and do nothing. + // -1 in standard C99 indicates an encoding error. Return 0 and do nothing. 
if (bytes_written == -1) { return 0; } #endif if (bytes_written >= remaining_capacity) { - gumbo_string_buffer_reserve( - parser, output->capacity + bytes_written, output); + gumbo_string_buffer_reserve(output->capacity + bytes_written, output); remaining_capacity = output->capacity - output->length; va_start(args, format); - bytes_written = vsnprintf( - output->data + output->length, remaining_capacity, format, args); + bytes_written = vsnprintf ( + output->data + output->length, + remaining_capacity, + format, + args + ); va_end(args); } output->length += bytes_written; return bytes_written; } -static void print_tag_stack(GumboParser* parser, const GumboParserError* error, - GumboStringBuffer* output) { - print_message(parser, output, " Currently open tags: "); +static void print_tag_stack ( + const GumboParserError* error, + GumboStringBuffer* output +) { + print_message(output, " Currently open tags: "); for (unsigned int i = 0; i < error->tag_stack.length; ++i) { if (i) { - print_message(parser, output, ", "); + print_message(output, ", "); } GumboTag tag = (GumboTag) error->tag_stack.data[i]; - print_message(parser, output, gumbo_normalized_tagname(tag)); + print_message(output, "%s", gumbo_normalized_tagname(tag)); } - gumbo_string_buffer_append_codepoint(parser, '.', output); + gumbo_string_buffer_append_codepoint('.', output); } -static void handle_parser_error(GumboParser* parser, - const GumboParserError* error, GumboStringBuffer* output) { - if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL && - error->input_type != GUMBO_TOKEN_DOCTYPE) { - print_message( - parser, output, "The doctype must be the first token in the document"); +static void handle_parser_error ( + const GumboParserError* error, + GumboStringBuffer* output +) { + if ( + error->parser_state == GUMBO_INSERTION_MODE_INITIAL + && error->input_type != GUMBO_TOKEN_DOCTYPE + ) { + print_message ( + output, + "The doctype must be the first token in the document" + ); return; } switch 
(error->input_type) { case GUMBO_TOKEN_DOCTYPE: - print_message(parser, output, "This is not a legal doctype"); + print_message(output, "This is not a legal doctype"); return; case GUMBO_TOKEN_COMMENT: // Should never happen; comments are always legal. assert(0); // But just in case... - print_message(parser, output, "Comments aren't legal here"); + print_message(output, "Comments aren't legal here"); return; case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_WHITESPACE: case GUMBO_TOKEN_CHARACTER: - print_message(parser, output, "Character tokens aren't legal here"); + print_message(output, "Character tokens aren't legal here"); return; case GUMBO_TOKEN_NULL: - print_message(parser, output, "Null bytes are not allowed in HTML5"); + print_message(output, "Null bytes are not allowed in HTML5"); return; case GUMBO_TOKEN_EOF: if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) { - print_message(parser, output, "You must provide a doctype"); + print_message(output, "You must provide a doctype"); } else { - print_message(parser, output, "Premature end of file"); - print_tag_stack(parser, error, output); + print_message(output, "Premature end of file"); + print_tag_stack(error, output); } return; case GUMBO_TOKEN_START_TAG: case GUMBO_TOKEN_END_TAG: - print_message(parser, output, "That tag isn't allowed here"); - print_tag_stack(parser, error, output); + print_message(output, "That tag isn't allowed here"); + print_tag_stack(error, output); // TODO(jdtang): Give more specific messaging. return; } } // Finds the preceding newline in an original source buffer from a given byte -// location. Returns a character pointer to the character after that, or a +// location. Returns a character pointer to the character after that, or a // pointer to the beginning of the string if this is the first line. 
-static const char* find_prev_newline( - const char* source_text, const char* error_location) { +static const char* find_prev_newline ( + const char* source_text, + const char* error_location +) { assert(error_location >= source_text); const char* c = error_location; if (*c == '\n' && c != source_text) --c; - for (; c != source_text && *c != '\n'; --c) - ; + while (c != source_text && *c != '\n') + --c; return c == source_text ? c : c + 1; } // Finds the next newline in the original source buffer from a given byte -// location. Returns a character pointer to that newline, or a pointer to the +// location. Returns a character pointer to that newline, or a pointer to the // terminating null byte if this is the last line. static const char* find_next_newline( - const char* source_text_end, const char* error_location) { + const char* source_text_end, + const char* error_location +) { assert(error_location <= source_text_end); const char* c = error_location; - for (; c != source_text_end && *c != '\n'; ++c) - ; + while (c != source_text_end && *c != '\n') + ++c; return c; } @@ -160,130 +186,176 @@ GumboError* gumbo_add_error(GumboParser* parser) { if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) { return NULL; } - GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError)); - gumbo_vector_add(parser, error, &parser->_output->errors); + GumboError* error = gumbo_alloc(sizeof(GumboError)); + gumbo_vector_add(error, &parser->_output->errors); return error; } -void gumbo_error_to_string( - GumboParser* parser, const GumboError* error, GumboStringBuffer* output) { - print_message( - parser, output, "@%d:%d: ", error->position.line, error->position.column); +void gumbo_error_to_string ( + const GumboError* error, + GumboStringBuffer* output +) { + print_message ( + output, + "@%zu:%zu: ", + error->position.line, + error->position.column + ); switch (error->type) { case GUMBO_ERR_UTF8_INVALID: - print_message( - parser, output, 
"Invalid UTF8 character 0x%x", error->v.codepoint); + print_message ( + output, + "Invalid UTF8 character 0x%" PRIx32, + error->v.codepoint + ); break; case GUMBO_ERR_UTF8_TRUNCATED: - print_message(parser, output, - "Input stream ends with a truncated UTF8 character 0x%x", - error->v.codepoint); + print_message ( + output, + "Input stream ends with a truncated UTF8 character 0x%" PRIx32, + error->v.codepoint + ); break; case GUMBO_ERR_UTF8_NULL: - print_message(parser, output, - "Unexpected NULL character in the input stream"); + print_message ( + output, + "Unexpected NULL character in the input stream" + ); break; case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS: - print_message( - parser, output, "No digits after &# in numeric character reference"); + print_message ( + output, + "No digits after &# in numeric character reference" + ); break; case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON: - print_message(parser, output, - "The numeric character reference &#%d should be followed " - "by a semicolon", - error->v.codepoint); + print_message ( + output, + "The numeric character reference &#%" PRIu32 " should be followed " + "by a semicolon", + error->v.codepoint + ); break; case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID: - print_message(parser, output, - "The numeric character reference &#%d; encodes an invalid " - "unicode codepoint", - error->v.codepoint); + print_message ( + output, + "The numeric character reference &#%" PRIu32 "; encodes an invalid " + "unicode codepoint", + error->v.codepoint + ); break; case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON: // The textual data came from one of the literal strings in the table, and // so it'll be null-terminated. 
- print_message(parser, output, - "The named character reference &%.*s should be followed by a " - "semicolon", - (int) error->v.text.length, error->v.text.data); + print_message ( + output, + "The named character reference &%.*s should be followed by a " + "semicolon", + (int) error->v.text.length, + error->v.text.data + ); break; case GUMBO_ERR_NAMED_CHAR_REF_INVALID: - print_message(parser, output, - "The named character reference &%.*s; is not a valid entity name", - (int) error->v.text.length, error->v.text.data); + print_message ( + output, + "The named character reference &%.*s; is not a valid entity name", + (int) error->v.text.length, + error->v.text.data + ); break; case GUMBO_ERR_DUPLICATE_ATTR: - print_message(parser, output, - "Attribute %s occurs multiple times, at positions %d and %d", - error->v.duplicate_attr.name, error->v.duplicate_attr.original_index, - error->v.duplicate_attr.new_index); + print_message ( + output, + "Attribute %s occurs multiple times, at positions %u and %u", + error->v.duplicate_attr.name, + error->v.duplicate_attr.original_index, + error->v.duplicate_attr.new_index + ); + break; + case GUMBO_ERR_DASHES_OR_DOCTYPE: + print_message ( + output, + "Incorrectly opened comment; expected '--', 'DOCTYPE', or '[CDATA['" + ); break; case GUMBO_ERR_PARSER: - case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG: - handle_parser_error(parser, &error->v.parser, output); + handle_parser_error(&error->v.parser, output); break; - case GUMBO_ERR_DASHES_OR_DOCTYPE: - print_message(parser, output, - "Incorrectly opened comment; expected '--', 'DOCTYPE', or '[CDATA['"); + case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG: + case GUMBO_ERR_SELF_CLOSING_END_TAG: + print_message ( + output, + "Tag cannot be self-closing"); break; default: - print_message(parser, output, - "Tokenizer error with an unimplemented error message"); + print_message ( + output, + "Tokenizer error with an unimplemented error message" + ); break; } - 
gumbo_string_buffer_append_codepoint(parser, '.', output); + gumbo_string_buffer_append_codepoint('.', output); } -void gumbo_caret_diagnostic_to_string(GumboParser* parser, - const GumboError* error, const char* source_text, - size_t length, GumboStringBuffer* output) { - gumbo_error_to_string(parser, error, output); +void gumbo_caret_diagnostic_to_string ( + const GumboError* error, + const char* source_text, + size_t source_length, + GumboStringBuffer* output +) { + gumbo_error_to_string(error, output); const char* line_start = find_prev_newline(source_text, error->original_text); - const char* line_end = find_next_newline(source_text+length, error->original_text); + const char* line_end = find_next_newline(source_text + source_length, error->original_text); GumboStringPiece original_line; original_line.data = line_start; original_line.length = line_end - line_start; - gumbo_string_buffer_append_codepoint(parser, '\n', output); - gumbo_string_buffer_append_string(parser, &original_line, output); - gumbo_string_buffer_append_codepoint(parser, '\n', output); - gumbo_string_buffer_reserve( - parser, output->length + error->position.column, output); - int num_spaces = error->position.column - 1; - memset(output->data + output->length, ' ', num_spaces); - output->length += num_spaces; - gumbo_string_buffer_append_codepoint(parser, '^', output); - gumbo_string_buffer_append_codepoint(parser, '\n', output); + gumbo_string_buffer_append_codepoint('\n', output); + gumbo_string_buffer_append_string(&original_line, output); + gumbo_string_buffer_append_codepoint('\n', output); + gumbo_string_buffer_reserve(output->length + error->position.column, output); + if (error->position.column >= 2) { + size_t num_spaces = error->position.column - 1; + memset(output->data + output->length, ' ', num_spaces); + output->length += num_spaces; + } + gumbo_string_buffer_append_codepoint('^', output); + gumbo_string_buffer_append_codepoint('\n', output); } -void 
gumbo_print_caret_diagnostic( - GumboParser* parser, const GumboError* error, const char* source_text, - size_t length) { +void gumbo_print_caret_diagnostic ( + const GumboError* error, + const char* source_text, + size_t source_length +) { GumboStringBuffer text; - gumbo_string_buffer_init(parser, &text); - gumbo_caret_diagnostic_to_string(parser, error, source_text, length, &text); + gumbo_string_buffer_init(&text); + gumbo_caret_diagnostic_to_string(error, source_text, source_length, &text); printf("%.*s", (int) text.length, text.data); - gumbo_string_buffer_destroy(parser, &text); + gumbo_string_buffer_destroy(&text); } -void gumbo_error_destroy(GumboParser* parser, GumboError* error) { - if (error->type == GUMBO_ERR_PARSER || - error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) { - gumbo_vector_destroy(parser, &error->v.parser.tag_stack); +void gumbo_error_destroy(GumboError* error) { + if ( + error->type == GUMBO_ERR_PARSER + || error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG + || error->type == GUMBO_ERR_SELF_CLOSING_END_TAG + ) { + gumbo_vector_destroy(&error->v.parser.tag_stack); } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) { - gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name); + gumbo_free((void*) error->v.duplicate_attr.name); } - gumbo_parser_deallocate(parser, error); + gumbo_free(error); } void gumbo_init_errors(GumboParser* parser) { - gumbo_vector_init(parser, 5, &parser->_output->errors); + gumbo_vector_init(5, &parser->_output->errors); } void gumbo_destroy_errors(GumboParser* parser) { for (unsigned int i = 0; i < parser->_output->errors.length; ++i) { - gumbo_error_destroy(parser, parser->_output->errors.data[i]); + gumbo_error_destroy(parser->_output->errors.data[i]); } - gumbo_vector_destroy(parser, &parser->_output->errors); + gumbo_vector_destroy(&parser->_output->errors); } diff --git a/gumbo-parser/src/error.h b/gumbo-parser/src/error.h index 344e57ee..839c71e0 100644 --- 
a/gumbo-parser/src/error.h +++ b/gumbo-parser/src/error.h @@ -1,26 +1,6 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// Error types, enums, and handling functions. - #ifndef GUMBO_ERROR_H_ #define GUMBO_ERROR_H_ -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#endif + #include #include "gumbo.h" @@ -77,11 +57,12 @@ typedef enum { GUMBO_ERR_DOCTYPE_END, GUMBO_ERR_PARSER, GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG, + GUMBO_ERR_SELF_CLOSING_END_TAG, } GumboErrorType; // Additional data for duplicated attributes. typedef struct GumboInternalDuplicateAttrError { - // The name of the attribute. Owned by this struct. + // The name of the attribute. Owned by this struct. const char* name; // The (0-based) index within the attributes vector of the original @@ -93,7 +74,7 @@ typedef struct GumboInternalDuplicateAttrError { } GumboDuplicateAttrError; // A simplified representation of the tokenizer state, designed to be more -// useful to clients of this library than the internal representation. This +// useful to clients of this library than the internal representation. This // condenses the actual states used in the tokenizer state machine into a few // values that will be familiar to users of HTML. typedef enum { @@ -129,20 +110,20 @@ typedef struct GumboInternalParserError { // The type of input token that resulted in this error. 
GumboTokenType input_type; - // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token. + // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token. GumboTag input_tag; // The insertion mode that the parser was in at the time. GumboInsertionMode parser_state; - // The tag stack at the point of the error. Note that this is an GumboVector + // The tag stack at the point of the error. Note that this is an GumboVector // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to // get at the tag. GumboVector /* GumboTag */ tag_stack; } GumboParserError; // The overall error struct representing an error in decoding/tokenizing/parsing -// the HTML. This contains an enumerated type flag, a source position, and then +// the HTML. This contains an enumerated type flag, a source position, and then // a union of fields containing data specific to the error. typedef struct GumboInternalError { // The type of error. @@ -163,7 +144,7 @@ typedef struct GumboInternalError { // * GUMBO_ERR_UTF8_TRUNCATED // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID - uint64_t codepoint; + uint32_t codepoint; // Tokenizer errors. GumboTokenizerError tokenizer; @@ -183,7 +164,7 @@ typedef struct GumboInternalError { } GumboError; // Adds a new error to the parser's error list, and returns a pointer to it so -// that clients can fill out the rest of its fields. May return NULL if we're +// that clients can fill out the rest of its fields. May return NULL if we're // already over the max_errors field specified in GumboOptions. GumboError* gumbo_add_error(struct GumboInternalParser* parser); @@ -194,32 +175,36 @@ void gumbo_init_errors(struct GumboInternalParser* errors); void gumbo_destroy_errors(struct GumboInternalParser* errors); // Frees the memory used for a single GumboError. -void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error); - -// Prints an error to a string. 
This fills an empty GumboStringBuffer with a -// freshly-allocated buffer containing the error message text. The caller is -// responsible for deleting the buffer. (Note that the buffer is allocated with -// the allocator specified in the GumboParser config and hence should be freed -// by gumbo_parser_deallocate().) -void gumbo_error_to_string(struct GumboInternalParser* parser, - const GumboError* error, GumboStringBuffer* output); - -// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer -// with a freshly-allocated buffer containing the error message text. The -// caller is responsible for deleting the buffer. (Note that the buffer is -// allocated with the allocator specified in the GumboParser config and hence -// should be freed by gumbo_parser_deallocate().) -void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser, - const GumboError* error, const char* source_text, size_t length, - GumboStringBuffer* output); +void gumbo_error_destroy(GumboError* error); + +// Prints an error to a string. This fills an empty GumboStringBuffer with a +// freshly-allocated buffer containing the error message text. The caller is +// responsible for freeing the buffer. +void gumbo_error_to_string ( + const GumboError* error, + GumboStringBuffer* output +); + +// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer +// with a freshly-allocated buffer containing the error message text. The +// caller is responsible for freeing the buffer. +void gumbo_caret_diagnostic_to_string ( + const GumboError* error, + const char* source_text, + size_t source_length, + GumboStringBuffer* output +); // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead // of writing to a string. 
-void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser, - const GumboError* error, const char* source_text, size_t length); +void gumbo_print_caret_diagnostic ( + const GumboError* error, + const char* source_text, + size_t source_length +); #ifdef __cplusplus } #endif -#endif // GUMBO_ERROR_H_ +#endif // GUMBO_ERROR_H_ diff --git a/gumbo-parser/src/foreign_attrs.c b/gumbo-parser/src/foreign_attrs.c new file mode 100644 index 00000000..7a77a188 --- /dev/null +++ b/gumbo-parser/src/foreign_attrs.c @@ -0,0 +1,104 @@ +/* ANSI-C code produced by gperf version 3.1 */ +/* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */ +/* Computed positions: -k'2,8' */ +/* Filtered by: mk/gperf-filter.sed */ + +#include "replacement.h" +#include "macros.h" +#include + +#define TOTAL_KEYWORDS 11 +#define MIN_WORD_LENGTH 5 +#define MAX_WORD_LENGTH 13 +#define MIN_HASH_VALUE 0 +#define MAX_HASH_VALUE 10 +/* maximum key range = 11, duplicates = 0 */ + +static inline unsigned int +hash (register const char *str, register size_t len) +{ + static const unsigned char asso_values[] = + { + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 2, + 11, 10, 11, 9, 7, 6, 11, 11, 1, 0, + 11, 5, 11, 11, 4, 11, 11, 11, 11, 11, + 11, 3, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 
11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11 + }; + register unsigned int hval = 0; + + switch (len) + { + default: + hval += asso_values[(unsigned char)str[7]]; + /*FALLTHROUGH*/ + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + hval += asso_values[(unsigned char)str[1]]; + break; + } + return hval; +} + +const ForeignAttrReplacement * +gumbo_get_foreign_attr_replacement (register const char *str, register size_t len) +{ + static const unsigned char lengthtable[] = + { + 5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8 + }; + static const ForeignAttrReplacement wordlist[] = + { + {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS}, + {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, + {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML}, + {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK}, + {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML}, + {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML} + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register unsigned int key = hash (str, len); + + if (key <= MAX_HASH_VALUE) + if (len == lengthtable[key]) + { + register const char *s = wordlist[key].from; + + if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1)) + return &wordlist[key]; + } + } + return 0; +} diff --git a/gumbo-parser/src/foreign_attrs.gperf b/gumbo-parser/src/foreign_attrs.gperf new file mode 100644 index 00000000..91a0aa40 --- /dev/null +++ b/gumbo-parser/src/foreign_attrs.gperf @@ -0,0 +1,27 @@ +%{ +#include "replacement.h" +#include "macros.h" +%} + +%struct-type +%omit-struct-type +%compare-lengths +%readonly-tables +%null-strings +%includes 
+%define lookup-function-name gumbo_get_foreign_attr_replacement +%define slot-name from +ForeignAttrReplacement; + +%% +"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK +"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK +"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK +"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK +"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK +"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK +"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML +"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML +"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML +"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS +"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS diff --git a/gumbo-parser/src/gumbo.h b/gumbo-parser/src/gumbo.h index f45a583e..e575bce1 100644 --- a/gumbo-parser/src/gumbo.h +++ b/gumbo-parser/src/gumbo.h @@ -1,51 +1,33 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and -// GUMBO_ as a prefix for enum constants (static constants get the Google-style -// kGumbo prefix). +// Copyright 2010 Google Inc. +// Copyright 2018 Craig Barnes. +// Licensed under the Apache License, version 2.0. 
+ +// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, +// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for +// static constants /** * @file * @mainpage Gumbo HTML Parser * - * This provides a conformant, no-dependencies implementation of the HTML5 - * parsing algorithm. It supports only UTF8; if you need to parse a different - * encoding, run a preprocessing step to convert to UTF8. It returns a parse - * tree made of the structs in this file. + * This provides a conformant, no-dependencies implementation of the + * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need + * to parse a different encoding, run a preprocessing step to convert + * to UTF-8. It returns a parse tree made of the structs in this file. * * Example: * @code * GumboOutput* output = gumbo_parse(input); * do_something_with_doctype(output->document); * do_something_with_html_tree(output->root); - * gumbo_destroy_output(&options, output); + * gumbo_destroy_output(output); * @endcode - * HTML5 Spec: * - * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html + * [HTML5]: https://html.spec.whatwg.org/multipage/ */ -#ifndef GUMBO_GUMBO_H_ -#define GUMBO_GUMBO_H_ - -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define fileno _fileno -#endif +#ifndef GUMBO_H +#define GUMBO_H #include #include @@ -55,41 +37,30 @@ extern "C" { #endif /** - * A struct representing a character position within the original text buffer. - * Line and column numbers are 1-based and offsets are 0-based, which matches - * how most editors and command-line tools work. 
Also, columns measure - * positions in terms of characters while offsets measure by bytes; this is - * because the offset field is often used to pull out a particular region of - * text (which in most languages that bind to C implies pointer arithmetic on a - * buffer of bytes), while the column field is often used to reference a - * particular column on a printable display, which nowadays is usually UTF-8. + * A struct representing a character position within the original text + * buffer. Line and column numbers are 1-based and offsets are 0-based, + * which matches how most editors and command-line tools work. */ typedef struct { - unsigned int line; - unsigned int column; - unsigned int offset; + size_t line; + size_t column; + size_t offset; } GumboSourcePosition; /** - * A SourcePosition used for elements that have no source position, i.e. - * parser-inserted elements. - */ -extern const GumboSourcePosition kGumboEmptySourcePosition; - -/** - * A struct representing a string or part of a string. Strings within the - * parser are represented by a char* and a length; the char* points into - * an existing data buffer owned by some other code (often the original input). - * GumboStringPieces are assumed (by convention) to be immutable, because they - * may share data. Use GumboStringBuffer if you need to construct a string. - * Clients should assume that it is not NUL-terminated, and should always use - * explicit lengths when manipulating them. + * A struct representing a string or part of a string. Strings within + * the parser are represented by a `char*` and a length; the `char*` + * points into an existing data buffer owned by some other code (often + * the original input). `GumboStringPiece`s are assumed (by convention) + * to be immutable, because they may share data. Clients should assume + * that it is not NUL-terminated and should always use explicit lengths + * when manipulating them. */ typedef struct { - /** A pointer to the beginning of the string. 
NULL iff length == 0. */ + /** A pointer to the beginning of the string. `NULL` if `length == 0`. */ const char* data; - /** The length of the string fragment, in bytes. May be zero. */ + /** The length of the string fragment, in bytes (may be zero). */ size_t length; } GumboStringPiece; @@ -97,31 +68,36 @@ typedef struct { extern const GumboStringPiece kGumboEmptyString; /** - * Compares two GumboStringPieces, and returns true if they're equal or false - * otherwise. + * Compares two `GumboStringPiece`s, and returns `true` if they're + * equal or `false` otherwise. */ -bool gumbo_string_equals( - const GumboStringPiece* str1, const GumboStringPiece* str2); +bool gumbo_string_equals ( + const GumboStringPiece* str1, + const GumboStringPiece* str2 +); /** - * Compares two GumboStringPieces ignoring case, and returns true if they're - * equal or false otherwise. + * Compares two `GumboStringPiece`s, ignoring case, and returns `true` + * if they're equal or `false` otherwise. */ -bool gumbo_string_equals_ignore_case( - const GumboStringPiece* str1, const GumboStringPiece* str2); +bool gumbo_string_equals_ignore_case ( + const GumboStringPiece* str1, + const GumboStringPiece* str2 +); /** - * A simple vector implementation. This stores a pointer to a data array and a - * length. All elements are stored as void*; client code must cast to the - * appropriate type. Overflows upon addition result in reallocation of the data - * array, with the size doubling to maintain O(1) amortized cost. There is no - * removal function, as this isn't needed for any of the operations within this - * library. Iteration can be done through inspecting the structure directly in - * a for-loop. + * A simple vector implementation. This stores a pointer to a data array + * and a length. All elements are stored as `void*`; client code must + * cast to the appropriate type. 
Overflows upon addition result in + * reallocation of the data array, with the size doubling to maintain + * `O(1)` amortized cost. There is no removal function, as this isn't + * needed for any of the operations within this library. Iteration can + * be done through inspecting the structure directly in a `for` loop. */ typedef struct { - /** Data elements. This points to a dynamically-allocated array of capacity - * elements, each a void* to the element itself. + /** + * Data elements. This points to a dynamically-allocated array of + * `capacity` elements, each a `void*` to the element itself. */ void** data; @@ -132,82 +108,229 @@ typedef struct { unsigned int capacity; } GumboVector; -/** An empty (0-length, 0-capacity) GumboVector. */ +/** An empty (0-length, 0-capacity) `GumboVector`. */ extern const GumboVector kGumboEmptyVector; /** - * Returns the first index at which an element appears in this vector (testing - * by pointer equality), or -1 if it never does. + * Returns the first index at which an element appears in this vector + * (testing by pointer equality), or `-1` if it never does. */ int gumbo_vector_index_of(GumboVector* vector, const void* element); /** - * An enum for all the tags defined in the HTML5 standard. These correspond to - * the tag names themselves. Enum constants exist only for tags which appear in - * the spec itself (or for tags with special handling in the SVG and MathML - * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag - * name can be obtained through original_tag. + * An `enum` for all the tags defined in the HTML5 standard. These + * correspond to the tag names themselves. Enum constants exist only + * for tags that appear in the spec itself (or for tags with special + * handling in the SVG and MathML namespaces). Any other tags appear + * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained + * through `original_tag`. 
* - * This is mostly for API convenience, so that clients of this library don't - * need to perform a strcasecmp to find the normalized tag name. It also has - * efficiency benefits, by letting the parser work with enums instead of - * strings. + * This is mostly for API convenience, so that clients of this library + * don't need to perform a `strcasecmp` to find the normalized tag + * name. It also has efficiency benefits, by letting the parser work + * with enums instead of strings. */ typedef enum { -// Load all the tags from an external source, generated from tag.in. -#include "tag_enum.h" - // Used for all tags that don't have special handling in HTML. Add new tags - // to the end of tag.in so as to preserve backwards-compatibility. + GUMBO_TAG_HTML, + GUMBO_TAG_HEAD, + GUMBO_TAG_TITLE, + GUMBO_TAG_BASE, + GUMBO_TAG_LINK, + GUMBO_TAG_META, + GUMBO_TAG_STYLE, + GUMBO_TAG_SCRIPT, + GUMBO_TAG_NOSCRIPT, + GUMBO_TAG_TEMPLATE, + GUMBO_TAG_BODY, + GUMBO_TAG_ARTICLE, + GUMBO_TAG_SECTION, + GUMBO_TAG_NAV, + GUMBO_TAG_ASIDE, + GUMBO_TAG_H1, + GUMBO_TAG_H2, + GUMBO_TAG_H3, + GUMBO_TAG_H4, + GUMBO_TAG_H5, + GUMBO_TAG_H6, + GUMBO_TAG_HGROUP, + GUMBO_TAG_HEADER, + GUMBO_TAG_FOOTER, + GUMBO_TAG_ADDRESS, + GUMBO_TAG_P, + GUMBO_TAG_HR, + GUMBO_TAG_PRE, + GUMBO_TAG_BLOCKQUOTE, + GUMBO_TAG_OL, + GUMBO_TAG_UL, + GUMBO_TAG_LI, + GUMBO_TAG_DL, + GUMBO_TAG_DT, + GUMBO_TAG_DD, + GUMBO_TAG_FIGURE, + GUMBO_TAG_FIGCAPTION, + GUMBO_TAG_MAIN, + GUMBO_TAG_DIV, + GUMBO_TAG_A, + GUMBO_TAG_EM, + GUMBO_TAG_STRONG, + GUMBO_TAG_SMALL, + GUMBO_TAG_S, + GUMBO_TAG_CITE, + GUMBO_TAG_Q, + GUMBO_TAG_DFN, + GUMBO_TAG_ABBR, + GUMBO_TAG_DATA, + GUMBO_TAG_TIME, + GUMBO_TAG_CODE, + GUMBO_TAG_VAR, + GUMBO_TAG_SAMP, + GUMBO_TAG_KBD, + GUMBO_TAG_SUB, + GUMBO_TAG_SUP, + GUMBO_TAG_I, + GUMBO_TAG_B, + GUMBO_TAG_U, + GUMBO_TAG_MARK, + GUMBO_TAG_RUBY, + GUMBO_TAG_RT, + GUMBO_TAG_RP, + GUMBO_TAG_BDI, + GUMBO_TAG_BDO, + GUMBO_TAG_SPAN, + GUMBO_TAG_BR, + GUMBO_TAG_WBR, + GUMBO_TAG_INS, + GUMBO_TAG_DEL, + 
GUMBO_TAG_IMAGE, + GUMBO_TAG_IMG, + GUMBO_TAG_IFRAME, + GUMBO_TAG_EMBED, + GUMBO_TAG_OBJECT, + GUMBO_TAG_PARAM, + GUMBO_TAG_VIDEO, + GUMBO_TAG_AUDIO, + GUMBO_TAG_SOURCE, + GUMBO_TAG_TRACK, + GUMBO_TAG_CANVAS, + GUMBO_TAG_MAP, + GUMBO_TAG_AREA, + GUMBO_TAG_MATH, + GUMBO_TAG_MI, + GUMBO_TAG_MO, + GUMBO_TAG_MN, + GUMBO_TAG_MS, + GUMBO_TAG_MTEXT, + GUMBO_TAG_MGLYPH, + GUMBO_TAG_MALIGNMARK, + GUMBO_TAG_ANNOTATION_XML, + GUMBO_TAG_SVG, + GUMBO_TAG_FOREIGNOBJECT, + GUMBO_TAG_DESC, + GUMBO_TAG_TABLE, + GUMBO_TAG_CAPTION, + GUMBO_TAG_COLGROUP, + GUMBO_TAG_COL, + GUMBO_TAG_TBODY, + GUMBO_TAG_THEAD, + GUMBO_TAG_TFOOT, + GUMBO_TAG_TR, + GUMBO_TAG_TD, + GUMBO_TAG_TH, + GUMBO_TAG_FORM, + GUMBO_TAG_FIELDSET, + GUMBO_TAG_LEGEND, + GUMBO_TAG_LABEL, + GUMBO_TAG_INPUT, + GUMBO_TAG_BUTTON, + GUMBO_TAG_SELECT, + GUMBO_TAG_DATALIST, + GUMBO_TAG_OPTGROUP, + GUMBO_TAG_OPTION, + GUMBO_TAG_TEXTAREA, + GUMBO_TAG_KEYGEN, + GUMBO_TAG_OUTPUT, + GUMBO_TAG_PROGRESS, + GUMBO_TAG_METER, + GUMBO_TAG_DETAILS, + GUMBO_TAG_SUMMARY, + GUMBO_TAG_MENU, + GUMBO_TAG_MENUITEM, + GUMBO_TAG_APPLET, + GUMBO_TAG_ACRONYM, + GUMBO_TAG_BGSOUND, + GUMBO_TAG_DIR, + GUMBO_TAG_FRAME, + GUMBO_TAG_FRAMESET, + GUMBO_TAG_NOFRAMES, + GUMBO_TAG_LISTING, + GUMBO_TAG_XMP, + GUMBO_TAG_NEXTID, + GUMBO_TAG_NOEMBED, + GUMBO_TAG_PLAINTEXT, + GUMBO_TAG_RB, + GUMBO_TAG_STRIKE, + GUMBO_TAG_BASEFONT, + GUMBO_TAG_BIG, + GUMBO_TAG_BLINK, + GUMBO_TAG_CENTER, + GUMBO_TAG_FONT, + GUMBO_TAG_MARQUEE, + GUMBO_TAG_MULTICOL, + GUMBO_TAG_NOBR, + GUMBO_TAG_SPACER, + GUMBO_TAG_TT, + GUMBO_TAG_RTC, + GUMBO_TAG_DIALOG, + // Used for all tags that don't have special handling in HTML. GUMBO_TAG_UNKNOWN, // A marker value to indicate the end of the enum, for iterating over it. - // Also used as the terminator for varargs functions that take tags. GUMBO_TAG_LAST, } GumboTag; /** - * Returns the normalized (usually all-lowercased, except for foreign content) - * tag name for an GumboTag enum. Return value is static data owned by the - * library. 
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The + * return value is static data owned by the library. */ const char* gumbo_normalized_tagname(GumboTag tag); /** - * Extracts the tag name from the original_text field of an element or token by - * stripping off characters and attributes and adjusting the passed-in - * GumboStringPiece appropriately. The tag name is in the original case and - * shares a buffer with the original text, to simplify memory management. - * Behavior is undefined if a string-piece that doesn't represent an HTML tag - * ( or ) is passed in. If the string piece is completely - * empty (NULL data pointer), then this function will exit successfully as a - * no-op. + * Extracts the tag name from the `original_text` field of an element + * or token by stripping off `` characters and attributes and + * adjusting the passed-in `GumboStringPiece` appropriately. The tag + * name is in the original case and shares a buffer with the original + * text, to simplify memory management. Behavior is undefined if a + * string piece that doesn't represent an HTML tag (`` or + * ``) is passed in. If the string piece is completely + * empty (`NULL` data pointer), then this function will exit + * successfully as a no-op. */ void gumbo_tag_from_original_text(GumboStringPiece* text); /** - * Fixes the case of SVG elements that are not all lowercase. - * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign - * This is not done at parse time because there's no place to store a mutated - * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags - * without special handling), while original_tag_name is a pointer into the - * original buffer. Instead, we provide this helper function that clients can - * use to rename SVG tags as appropriate. - * Returns the case-normalized SVG tagname if a replacement is found, or NULL if - * no normalization is called for. 
The return value is static data and owned by - * the library. + * Fixes the case of SVG elements that are not all lowercase. This is + * not done at parse time because there's no place to store a mutated + * tag name. `tag_name` is an enum (which will be `TAG_UNKNOWN` for most + * SVG tags without special handling), while `original_tag_name` is a + * pointer into the original buffer. Instead, we provide this helper + * function that clients can use to rename SVG tags as appropriate. + * Returns the case-normalized SVG tagname if a replacement is found, or + * `NULL` if no normalization is called for. The return value is static + * data and owned by the library. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign */ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname); /** - * Converts a tag name string (which may be in upper or mixed case) to a tag - * enum. The `tag` version expects `tagname` to be NULL-terminated + * Converts a tag name string (which may be in upper or mixed case) to a + * tag enum. */ -GumboTag gumbo_tag_enum(const char* tagname); -GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length); +GumboTag gumbo_tagn_enum(const char* tagname, size_t length); /** * Attribute namespaces. - * HTML includes special handling for XLink, XML, and XMLNS namespaces on - * attributes. Everything else goes in the generic "NONE" namespace. + * HTML includes special handling for XLink, XML, and XMLNS namespaces + * on attributes. Everything else goes in the generic "NONE" namespace. */ typedef enum { GUMBO_ATTR_NAMESPACE_NONE, @@ -217,46 +340,47 @@ typedef enum { } GumboAttributeNamespaceEnum; /** - * A struct representing a single attribute on an HTML tag. This is a - * name-value pair, but also includes information about source locations and - * original source text. + * A struct representing a single attribute on a HTML tag. 
This is a + * name-value pair, but also includes information about source locations + * and original source text. */ typedef struct { /** - * The namespace for the attribute. This will usually be - * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special - * values, per: - * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes + * The namespace for the attribute. This will usually be + * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes + * take special values, per: + * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes */ GumboAttributeNamespaceEnum attr_namespace; /** - * The name of the attribute. This is in a freshly-allocated buffer to deal - * with case-normalization, and is null-terminated. + * The name of the attribute. This is in a freshly-allocated buffer to + * deal with case-normalization and is null-terminated. */ const char* name; /** - * The original text of the attribute name, as a pointer into the original - * source buffer. + * The original text of the attribute name, as a pointer into the + * original source buffer. */ GumboStringPiece original_name; /** - * The value of the attribute. This is in a freshly-allocated buffer to deal - * with unescaping, and is null-terminated. It does not include any quotes - * that surround the attribute. If the attribute has no value (for example, - * 'selected' on a checkbox), this will be an empty string. + * The value of the attribute. This is in a freshly-allocated buffer + * to deal with unescaping and is null-terminated. It does not include + * any quotes that surround the attribute. If the attribute has no + * value (for example, `selected` on a checkbox) this will be an empty + * string. */ const char* value; /** - * The original text of the value of the attribute. This points into the - * original source buffer. 
It includes any quotes that surround the - * attribute, and you can look at original_value.data[0] and - * original_value.data[original_value.length - 1] to determine what the quote - * characters were. If the attribute has no value, this will be a 0-length - * string. + * The original text of the value of the attribute. This points into + * the original source buffer. It includes any quotes that surround + * the attribute and you can look at `original_value.data[0]` and + * `original_value.data[original_value.length - 1]` to determine what + * the quote characters were. If the attribute has no value this will + * be a 0-length string. */ GumboStringPiece original_value; @@ -264,9 +388,9 @@ typedef struct { GumboSourcePosition name_start; /** - * The ending position of the attribute name. This is not always derivable + * The ending position of the attribute name. This is not always derivable * from the starting position of the value because of the possibility of - * whitespace around the = sign. + * whitespace around the `=` sign. */ GumboSourcePosition name_end; @@ -278,34 +402,37 @@ typedef struct { } GumboAttribute; /** - * Given a vector of GumboAttributes, look up the one with the specified name - * and return it, or NULL if no such attribute exists. This uses a - * case-insensitive match, as HTML is case-insensitive. + * Given a vector of `GumboAttribute`s, look up the one with the + * specified name and return it, or `NULL` if no such attribute exists. + * This uses a case-insensitive match, as HTML is case-insensitive. */ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name); /** - * Enum denoting the type of node. This determines the type of the node.v - * union. + * Enum denoting the type of node. This determines the type of the + * `node.v` union. */ typedef enum { - /** Document node. v will be a GumboDocument. */ + /** Document node. `v` will be a `GumboDocument`. */ GUMBO_NODE_DOCUMENT, - /** Element node. 
v will be a GumboElement. */ + /** Element node. `v` will be a `GumboElement`. */ GUMBO_NODE_ELEMENT, - /** Text node. v will be a GumboText. */ + /** Text node. `v` will be a `GumboText`. */ GUMBO_NODE_TEXT, - /** CDATA node. v will be a GumboText. */ + /** CDATA node. `v` will be a `GumboText`. */ GUMBO_NODE_CDATA, - /** Comment node. v will be a GumboText, excluding comment delimiters. */ + /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */ GUMBO_NODE_COMMENT, - /** Text node, where all contents is whitespace. v will be a GumboText. */ + /** Text node, where all contents is whitespace. `v` will be a `GumboText`. */ GUMBO_NODE_WHITESPACE, - /** Template node. This is separate from GUMBO_NODE_ELEMENT because many - * client libraries will want to ignore the contents of template nodes, as - * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing - * here, while clients that want to include template contents should also - * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */ + /** + * Template node. This is separate from `GUMBO_NODE_ELEMENT` because + * many client libraries will want to ignore the contents of template + * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will + * do the right thing here, while clients that want to include template + * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a + * `GumboElement`. + */ GUMBO_NODE_TEMPLATE } GumboNodeType; @@ -315,9 +442,7 @@ typedef enum { */ typedef struct GumboInternalNode GumboNode; -/** - * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode - */ +/** https://dom.spec.whatwg.org/#concept-document-quirks */ typedef enum { GUMBO_DOCTYPE_NO_QUIRKS, GUMBO_DOCTYPE_QUIRKS, @@ -326,10 +451,11 @@ typedef enum { /** * Namespaces. - * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. 
Rather, - * anything inside an tag is in the SVG namespace, anything inside the - * tag is in the MathML namespace, and anything else is inside the HTML - * namespace. No other namespaces are supported, so this can be an enum only. + * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. + * Rather, anything inside an `` tag is in the SVG namespace, + * anything inside the `` tag is in the MathML namespace, and + * anything else is inside the HTML namespace. No other namespaces are + * supported, so this can be an `enum`. */ typedef enum { GUMBO_NAMESPACE_HTML, @@ -339,66 +465,70 @@ typedef enum { /** * Parse flags. - * We track the reasons for parser insertion of nodes and store them in a - * bitvector in the node itself. This lets client code optimize out nodes that - * are implied by the HTML structure of the document, or flag constructs that - * may not be allowed by a style guide, or track the prevalence of incorrect or - * tricky HTML code. + * We track the reasons for parser insertion of nodes and store them in + * a bitvector in the node itself. This lets client code optimize out + * nodes that are implied by the HTML structure of the document, or flag + * constructs that may not be allowed by a style guide, or track the + * prevalence of incorrect or tricky HTML code. */ typedef enum { /** - * A normal node - both start and end tags appear in the source, nothing has - * been reparented. + * A normal node -- both start and end tags appear in the source, + * nothing has been reparented. */ GUMBO_INSERTION_NORMAL = 0, /** - * A node inserted by the parser to fulfill some implicit insertion rule. - * This is usually set in addition to some other flag giving a more specific - * insertion reason; it's a generic catch-all term meaning "The start tag for - * this node did not appear in the document source". + * A node inserted by the parser to fulfill some implicit insertion + * rule. 
This is usually set in addition to some other flag giving a + * more specific insertion reason; it's a generic catch-all term + * meaning "The start tag for this node did not appear in the document + * source". */ GUMBO_INSERTION_BY_PARSER = 1 << 0, /** - * A flag indicating that the end tag for this node did not appear in the - * document source. Note that in some cases, you can still have - * parser-inserted nodes with an explicit end tag: for example, "Text" - * has GUMBO_INSERTED_BY_PARSER set on the node, but - * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the tag actually - * exists. This flag will be set only if the end tag is completely missing; - * in some cases, the end tag may be misplaced (eg. a tag with text - * afterwards), which will leave this flag unset and require clients to - * inspect the parse errors for that case. + * A flag indicating that the end tag for this node did not appear in + * the document source. Note that in some cases, you can still have + * parser-inserted nodes with an explicit end tag. For example, + * `Text` has `GUMBO_INSERTED_BY_PARSER` set on the `` + * node, but `GUMBO_INSERTED_END_TAG_IMPLICITLY` is unset, as the + * `` tag actually exists. + * + * This flag will be set only if the end tag is completely missing. + * In some cases, the end tag may be misplaced (e.g. a `` tag + * with text afterwards), which will leave this flag unset and require + * clients to inspect the parse errors for that case. */ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1, // Value 1 << 2 was for a flag that has since been removed. /** - * A flag for nodes that are inserted because their presence is implied by - * other tags, eg. , , , , etc. + * A flag for nodes that are inserted because their presence is + * implied by other tags, e.g. ``, ``, ``, + * ``, etc. */ GUMBO_INSERTION_IMPLIED = 1 << 3, /** - * A flag for nodes that are converted from their end tag equivalents. For - * example,

</p> when no paragraph is open implies that the parser should - * create a <p> tag and immediately close it, while </br> means the same thing - * as <br>. + * A flag for nodes that are converted from their end tag equivalents. + * For example, `</p>` when no paragraph is open implies that the + * parser should create a `<p>` tag and immediately close it, while + * `</br>` means the same thing as `<br>
`. */ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4, - /** A flag for nodes that are converted from the parse of an tag. */ - GUMBO_INSERTION_FROM_ISINDEX = 1 << 5, + // Value 1 << 5 was for a flag that has since been removed. - /** A flag for tags that are rewritten as . */ + /** A flag for `` tags that are rewritten as ``. */ GUMBO_INSERTION_FROM_IMAGE = 1 << 6, /** - * A flag for nodes that are cloned as a result of the reconstruction of - * active formatting elements. This is set only on the clone; the initial - * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG. + * A flag for nodes that are cloned as a result of the reconstruction + * of active formatting elements. This is set only on the clone; the + * initial portion of the formatting run is a NORMAL node with an + * `IMPLICIT_END_TAG`. */ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7, @@ -415,18 +545,19 @@ typedef enum { GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10, } GumboParseFlags; -/** - * Information specific to document nodes. - */ +/** Information specific to document nodes. */ typedef struct { /** - * An array of GumboNodes, containing the children of this element. This will - * normally consist of the element and any comment nodes found. - * Pointers are owned. + * An array of `GumboNode`s, containing the children of this element. + * This will normally consist of the `` element and any comment + * nodes found. Pointers are owned. */ GumboVector /* GumboNode* */ children; - // True if there was an explicit doctype token as opposed to it being omitted. + /** + * `true` if there was an explicit doctype token, as opposed to it + * being omitted. + */ bool has_doctype; // Fields from the doctype token, copied verbatim. @@ -435,65 +566,70 @@ typedef struct { const char* system_identifier; /** - * Whether or not the document is in QuirksMode, as determined by the values - * in the GumboTokenDocType template. 
+ * Whether or not the document is in QuirksMode, as determined by the + * values in the GumboTokenDocType template. */ GumboQuirksModeEnum doc_type_quirks_mode; } GumboDocument; /** - * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements. - * This contains just a block of text and its position. + * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE + * elements. This contains just a block of text and its position. */ typedef struct { /** - * The text of this node, after entities have been parsed and decoded. For - * comment/cdata nodes, this does not include the comment delimiters. + * The text of this node, after entities have been parsed and decoded. + * For comment and cdata nodes, this does not include the comment + * delimiters. */ const char* text; /** - * The original text of this node, as a pointer into the original buffer. For - * comment/cdata nodes, this includes the comment delimiters. + * The original text of this node, as a pointer into the original + * buffer. For comment/cdata nodes, this includes the comment + * delimiters. */ GumboStringPiece original_text; /** - * The starting position of this node. This corresponds to the position of - * original_text, before entities are decoded. + * The starting position of this node. This corresponds to the + * position of `original_text`, before entities are decoded. * */ GumboSourcePosition start_pos; } GumboText; /** - * The struct used to represent all HTML elements. This contains information - * about the tag, attributes, and child nodes. + * The struct used to represent all HTML elements. This contains + * information about the tag, attributes, and child nodes. */ typedef struct { /** - * An array of GumboNodes, containing the children of this element. Pointers - * are owned. + * An array of `GumboNode`s, containing the children of this element. + * Pointers are owned. */ GumboVector /* GumboNode* */ children; /** The GumboTag enum for this element. 
*/ GumboTag tag; + /** The name for this element. */ + const char* name; + /** The GumboNamespaceEnum for this element. */ GumboNamespaceEnum tag_namespace; /** - * A GumboStringPiece pointing to the original tag text for this element, - * pointing directly into the source buffer. If the tag was inserted - * algorithmically (for example, or insertion), this will be a - * zero-length string. + * A `GumboStringPiece` pointing to the original tag text for this + * element, pointing directly into the source buffer. If the tag was + * inserted algorithmically (for example, `` or `` + * insertion), this will be a zero-length string. */ GumboStringPiece original_tag; /** - * A GumboStringPiece pointing to the original end tag text for this element. - * If the end tag was inserted algorithmically, (for example, closing a - * self-closing tag), this will be a zero-length string. + * A `GumboStringPiece` pointing to the original end tag text for this + * element. If the end tag was inserted algorithmically, (for example, + * closing a self-closing tag), this will be a zero-length string. */ GumboStringPiece original_end_tag; @@ -504,30 +640,31 @@ typedef struct { GumboSourcePosition end_pos; /** - * An array of GumboAttributes, containing the attributes for this tag in the - * order that they were parsed. Pointers are owned. + * An array of `GumboAttribute`s, containing the attributes for this + * tag in the order that they were parsed. Pointers are owned. */ GumboVector /* GumboAttribute* */ attributes; } GumboElement; /** - * A supertype for GumboElement and GumboText, so that we can include one - * generic type in lists of children and cast as necessary to subtypes. + * A supertype for `GumboElement` and `GumboText`, so that we can + * include one generic type in lists of children and cast as necessary + * to subtypes. */ struct GumboInternalNode { /** The type of node that this is. */ GumboNodeType type; - /** Pointer back to parent node. Not owned. 
*/ + /** Pointer back to parent node. Not owned. */ GumboNode* parent; /** The index within the parent's children vector of this node. */ - size_t index_within_parent; + unsigned int index_within_parent; /** - * A bitvector of flags containing information about why this element was - * inserted into the parse tree, including a variety of special parse - * situations. + * A bitvector of flags containing information about why this element + * was inserted into the parse tree, including a variety of special + * parse situations. */ GumboParseFlags parse_flags; @@ -539,81 +676,55 @@ struct GumboInternalNode { } v; }; -/** - * The type for an allocator function. Takes the 'userdata' member of the - * GumboParser struct as its first argument. Semantics should be the same as - * malloc, i.e. return a block of size_t bytes on success or NULL on failure. - * Allocating a block of 0 bytes behaves as per malloc. - */ -// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition. -typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size); - -/** - * The type for a deallocator function. Takes the 'userdata' member of the - * GumboParser struct as its first argument. - */ -typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); - /** * Input struct containing configuration options for the parser. - * These let you specify alternate memory managers, provide different error - * handling, etc. - * Use kGumboDefaultOptions for sensible defaults, and only set what you need. + * These let you specify alternate memory managers, provide different + * error handling, etc. Use `kGumboDefaultOptions` for sensible + * defaults and only set what you need. */ typedef struct GumboInternalOptions { - /** A memory allocator function. Default: malloc. */ - GumboAllocatorFunction allocator; - - /** A memory deallocator function. Default: free. 
*/ - GumboDeallocatorFunction deallocator; - /** - * An opaque object that's passed in as the first argument to all callbacks - * used by this library. Default: NULL. - */ - void* userdata; - - /** - * The tab-stop size, for computing positions in source code that uses tabs. - * Default: 8. + * The tab-stop size, for computing positions in HTML files that + * use tabs. Default: `8`. */ int tab_stop; /** * Whether or not to stop parsing when the first error is encountered. - * Default: false. + * Default: `false`. */ bool stop_on_first_error; /** - * The maximum number of errors before the parser stops recording them. This - * is provided so that if the page is totally borked, we don't completely fill - * up the errors vector and exhaust memory with useless redundant errors. Set - * to -1 to disable the limit. - * Default: -1 + * The maximum number of errors before the parser stops recording + * them. This is provided so that if the page is totally borked, we + * don't completely fill up the errors vector and exhaust memory with + * useless redundant errors. Set to `-1` to disable the limit. + * Default: `-1`. */ int max_errors; /** * The fragment context for parsing: - * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments + * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments * - * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e. - * the regular parsing algorithm. Otherwise, pass the tag enum for the - * intended parent of the parsed fragment. We use just the tag enum rather - * than a full node because that's enough to set all the parsing context we - * need, and it provides some additional flexibility for client code to act as - * if parsing a fragment even when a full HTML tree isn't available. + * If `GUMBO_TAG_LAST` is passed here, it is assumed to be "no + * fragment", i.e. the regular parsing algorithm. Otherwise, pass the + * tag enum for the intended parent of the parsed fragment. 
We use + * just the tag enum rather than a full node because that's enough to + * set all the parsing context we need and it provides some additional + * flexibility for client code to act as if parsing a fragment even + * when a full HTML tree isn't available. * - * Default: GUMBO_TAG_LAST + * Default: `GUMBO_TAG_LAST`. */ GumboTag fragment_context; /** - * The namespace for the fragment context. This lets client code - * differentiate between, say, parsing a tag in SVG vs. parsing it in - * HTML. - * Default: GUMBO_NAMESPACE_HTML + * The namespace for the fragment context. This lets client code + * differentiate between, say, parsing a `<title>` tag in SVG vs. + * parsing it in HTML. + * Default: `GUMBO_NAMESPACE_HTML`. */ GumboNamespaceEnum fragment_namespace; } GumboOptions; @@ -621,51 +732,70 @@ typedef struct GumboInternalOptions { /** Default options struct; use this with gumbo_parse_with_options. */ extern const GumboOptions kGumboDefaultOptions; +typedef enum { + GUMBO_STATUS_OK, + GUMBO_STATUS_OUT_OF_MEMORY, + GUMBO_STATUS_TREE_TOO_DEEP +} GumboOutputStatus; + + /** The output struct containing the results of the parse. */ typedef struct GumboInternalOutput { /** - * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT - * that contains the entire document as its child. + * Pointer to the document node. This is a `GumboNode` of type + * `NODE_DOCUMENT` that contains the entire document as its child. */ GumboNode* document; /** - * Pointer to the root node. This the <html> tag that forms the root of the - * document. + * Pointer to the root node. This is the `<html>` tag that forms the + * root of the document. */ GumboNode* root; /** * A list of errors that occurred during the parse. * NOTE: In version 1.0 of this library, the API for errors hasn't been fully - * fleshed out and may change in the future. For this reason, the GumboError - * header isn't part of the public API. 
Contact us if you need errors + * fleshed out and may change in the future. For this reason, the GumboError + * header isn't part of the public API. Contact us if you need errors * reported so we can work out something appropriate for your use-case. */ GumboVector /* GumboError */ errors; + + /** + * A status code indicating whether parsing finished successfully or was + * stopped mid-document due to exceptional circumstances. + */ + GumboOutputStatus status; } GumboOutput; /** - * Parses a buffer of UTF8 text into an GumboNode parse tree. The buffer must - * live at least as long as the parse tree, as some fields (eg. original_text) - * point directly into the original buffer. + * Parses a buffer of UTF-8 text into an `GumboNode` parse tree. The + * buffer must live at least as long as the parse tree, as some fields + * (eg. `original_text`) point directly into the original buffer. * * This doesn't support buffers longer than 4 gigabytes. */ GumboOutput* gumbo_parse(const char* buffer); /** - * Extended version of gumbo_parse that takes an explicit options structure, - * buffer, and length. + * Extended version of `gumbo_parse` that takes an explicit options + * structure, buffer, and length. */ -GumboOutput* gumbo_parse_with_options( - const GumboOptions* options, const char* buffer, size_t buffer_length); +GumboOutput* gumbo_parse_with_options ( + const GumboOptions* options, + const char* buffer, + size_t buffer_length +); + +/** Convert a `GumboOutputStatus` code into a readable description. */ +const char* gumbo_status_to_string(GumboOutputStatus status); -/** Release the memory used for the parse tree & parse errors. */ -void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output); +/** Release the memory used for the parse tree and parse errors. 
*/ +void gumbo_destroy_output(GumboOutput* output); #ifdef __cplusplus } #endif -#endif // GUMBO_GUMBO_H_ +#endif // GUMBO_H diff --git a/gumbo-parser/src/insertion_mode.h b/gumbo-parser/src/insertion_mode.h index 45134c13..6cb1d341 100644 --- a/gumbo-parser/src/insertion_mode.h +++ b/gumbo-parser/src/insertion_mode.h @@ -1,29 +1,9 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) - #ifndef GUMBO_INSERTION_MODE_H_ #define GUMBO_INSERTION_MODE_H_ -#ifdef __cplusplus -extern "C" { -#endif - -// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode -// If new enum values are added, be sure to update the kTokenHandlers dispatch -// table in parser.c. +// https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode +// If new enum values are added, be sure to update the kTokenHandlers +// dispatch table in parser.c. 
typedef enum { GUMBO_INSERTION_MODE_INITIAL, GUMBO_INSERTION_MODE_BEFORE_HTML, @@ -50,8 +30,4 @@ typedef enum { GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET } GumboInsertionMode; -#ifdef __cplusplus -} // extern C -#endif - -#endif // GUMBO_INSERTION_MODE_H_ +#endif // GUMBO_INSERTION_MODE_H_ diff --git a/gumbo-parser/src/macros.h b/gumbo-parser/src/macros.h new file mode 100644 index 00000000..ccf8bb86 --- /dev/null +++ b/gumbo-parser/src/macros.h @@ -0,0 +1,91 @@ +#ifndef MACROS_H +#define MACROS_H + +#if (!defined(__STDC_VERSION__) || !(__STDC_VERSION__ >= 199901L)) \ + && !defined(_WIN32) && !defined(__cplusplus) +# error C99 compiler required +#endif + +#if defined(_WIN32) +# define inline __inline +# define __func__ __FUNCTION__ +#endif + +// Calculate the number of elements in an array. +// The extra division on the third line is a trick to help prevent +// passing a pointer to the first element of an array instead of a +// reference to the array itself. +#define ARRAY_COUNT(x) ( \ + (sizeof(x) / sizeof((x)[0])) \ + / ((size_t)(!(sizeof(x) % sizeof((x)[0])))) \ +) + +#ifdef NDEBUG + #define UNUSED_IF_NDEBUG(x) (void)(x) +#else + #define UNUSED_IF_NDEBUG(x) +#endif + +#ifdef __GNUC__ + #define GNUC_AT_LEAST(major, minor) ( \ + (__GNUC__ > major) \ + || ((__GNUC__ == major) && (__GNUC_MINOR__ >= minor)) ) +#else + #define GNUC_AT_LEAST(major, minor) 0 +#endif + +#ifdef __has_attribute + #define HAS_ATTRIBUTE(x) __has_attribute(x) +#else + #define HAS_ATTRIBUTE(x) 0 +#endif + +#if GNUC_AT_LEAST(3, 0) || HAS_ATTRIBUTE(unused) || defined(__TINYC__) + #define UNUSED __attribute__((__unused__)) +#else + #define UNUSED +#endif + +#if GNUC_AT_LEAST(3, 0) + #define MALLOC __attribute__((__malloc__)) + #define PRINTF(x) __attribute__((__format__(__printf__, (x), (x + 1)))) + #define PURE __attribute__((__pure__)) + #define CONST_FN __attribute__((__const__)) +#else + #define MALLOC + #define PRINTF(x) + #define PURE + #define CONST_FN +#endif + +#define UNUSED_ARG(x) 
unused__ ## x UNUSED + +#if GNUC_AT_LEAST(3, 0) && defined(__OPTIMIZE__) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) +#else + #define likely(x) (x) + #define unlikely(x) (x) +#endif + +#if GNUC_AT_LEAST(3, 3) || HAS_ATTRIBUTE(nonnull) + #define NONNULL_ARGS __attribute__((__nonnull__)) +#else + #define NONNULL_ARGS +#endif + +#if GNUC_AT_LEAST(3, 4) || HAS_ATTRIBUTE(warn_unused_result) + #define WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#else + #define WARN_UNUSED_RESULT +#endif + +#if GNUC_AT_LEAST(5, 0) || HAS_ATTRIBUTE(returns_nonnull) + #define RETURNS_NONNULL __attribute__((__returns_nonnull__)) +#else + #define RETURNS_NONNULL +#endif + +#define XMALLOC MALLOC RETURNS_NONNULL + +#endif // ndef MACROS_H diff --git a/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c index dc692b3e..41dc1595 100644 --- a/gumbo-parser/src/parser.c +++ b/gumbo-parser/src/parser.c @@ -1,310 +1,181 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2017-2018 Craig Barnes. + Copyright 2010 Google Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include <assert.h> -#include <ctype.h> #include <stdarg.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> -#include <strings.h> +#include "ascii.h" #include "attribute.h" #include "error.h" #include "gumbo.h" #include "insertion_mode.h" +#include "macros.h" #include "parser.h" +#include "replacement.h" #include "tokenizer.h" #include "tokenizer_states.h" #include "utf8.h" #include "util.h" #include "vector.h" -#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i) - -#define GUMBO_STRING(literal) \ - { literal, sizeof(literal) - 1 } -#define TERMINATOR \ - { "", 0 } - -typedef char gumbo_tagset[GUMBO_TAG_LAST]; +typedef uint8_t TagSet[GUMBO_TAG_LAST + 1]; #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML) #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG) #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML) -#define TAGSET_INCLUDES(tagset, namespace, tag) \ - (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace)) - -// selected forward declarations as it is getting hard to find -// an appropriate order -static bool node_html_tag_is(const GumboNode*, GumboTag); -static GumboInsertionMode get_current_template_insertion_mode( - const GumboParser*); -static bool handle_in_template(GumboParser*, GumboToken*); -static void destroy_node(GumboParser*, GumboNode*); - -static void* malloc_wrapper(void* unused, size_t size) { return malloc(size); } +static const GumboSourcePosition kGumboEmptySourcePosition = { \ + .line = 0, \ + .column = 0, \ + .offset = 0 \ +}; -static void free_wrapper(void* 
unused, void* ptr) { free(ptr); } +const GumboOptions kGumboDefaultOptions = { + .tab_stop = 8, + .stop_on_first_error = false, + .max_errors = -1, + .fragment_context = GUMBO_TAG_LAST, + .fragment_namespace = GUMBO_NAMESPACE_HTML +}; -const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL, - 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML}; +#define STRING(s) {.data = s, .length = sizeof(s) - 1} +#define TERMINATOR {.data = "", .length = 0} -static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html"); static const GumboStringPiece kPublicIdHtml4_0 = - GUMBO_STRING("-//W3C//DTD HTML 4.0//EN"); + STRING("-//W3C//DTD HTML 4.0//EN"); static const GumboStringPiece kPublicIdHtml4_01 = - GUMBO_STRING("-//W3C//DTD HTML 4.01//EN"); + STRING("-//W3C//DTD HTML 4.01//EN"); static const GumboStringPiece kPublicIdXhtml1_0 = - GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN"); + STRING("-//W3C//DTD XHTML 1.0 Strict//EN"); static const GumboStringPiece kPublicIdXhtml1_1 = - GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN"); + STRING("-//W3C//DTD XHTML 1.1//EN"); static const GumboStringPiece kSystemIdRecHtml4_0 = - GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd"); + STRING("http://www.w3.org/TR/REC-html40/strict.dtd"); static const GumboStringPiece kSystemIdHtml4 = - GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd"); + STRING("http://www.w3.org/TR/html4/strict.dtd"); static const GumboStringPiece kSystemIdXhtmlStrict1_1 = - GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); + STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"); static const GumboStringPiece kSystemIdXhtml1_1 = - GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"); + STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"); static const GumboStringPiece kSystemIdLegacyCompat = - GUMBO_STRING("about:legacy-compat"); + STRING("about:legacy-compat"); // The doctype arrays have an explicit terminator because we want to pass them // to a helper 
function, and passing them as a pointer discards sizeof -// information. The SVG arrays are used only by one-off functions, and so loops +// information. The SVG arrays are used only by one-off functions, and so loops // over them use sizeof directly instead of a terminator. static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { - GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), - GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), - GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), - GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"), - GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"), - GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), - GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), - GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"), - GUMBO_STRING("-//IETF//DTD HTML 2.0//"), - GUMBO_STRING("-//IETF//DTD HTML 2.1E//"), - GUMBO_STRING("-//IETF//DTD HTML 3.0//"), - GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"), - GUMBO_STRING("-//IETF//DTD HTML 3.2//"), - GUMBO_STRING("-//IETF//DTD HTML 3//"), - GUMBO_STRING("-//IETF//DTD HTML Level 0//"), - GUMBO_STRING("-//IETF//DTD HTML Level 1//"), - GUMBO_STRING("-//IETF//DTD HTML Level 2//"), - GUMBO_STRING("-//IETF//DTD HTML Level 3//"), - GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"), - GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"), - GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"), - GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"), - GUMBO_STRING("-//IETF//DTD HTML Strict//"), - GUMBO_STRING("-//IETF//DTD HTML//"), - GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"), - GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), - GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), - GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), - GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), - GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), - GUMBO_STRING("-//Microsoft//DTD 
Internet Explorer 3.0 Tables//"), - GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"), - GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"), - GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), - GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), - GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), - GUMBO_STRING( - "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" - "extensions to HTML 4.0//"), - GUMBO_STRING( - "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" - "extensions to HTML 4.0//"), - GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"), - GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), - GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), - GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), - GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"), - GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"), - GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"), - GUMBO_STRING("-//W3C//DTD HTML 3.2//"), - GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"), - GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"), - GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"), - GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"), - GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"), - GUMBO_STRING("-//W3C//DTD W3 HTML//"), - GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"), - GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), - GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR}; + STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), + STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), + STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), + STRING("-//IETF//DTD HTML 2.0 Level 1//"), + STRING("-//IETF//DTD HTML 2.0 Level 2//"), + STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), + STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), + STRING("-//IETF//DTD HTML 2.0 Strict//"), + STRING("-//IETF//DTD HTML 2.0//"), + STRING("-//IETF//DTD HTML 2.1E//"), + 
STRING("-//IETF//DTD HTML 3.0//"), + STRING("-//IETF//DTD HTML 3.2 Final//"), + STRING("-//IETF//DTD HTML 3.2//"), + STRING("-//IETF//DTD HTML 3//"), + STRING("-//IETF//DTD HTML Level 0//"), + STRING("-//IETF//DTD HTML Level 1//"), + STRING("-//IETF//DTD HTML Level 2//"), + STRING("-//IETF//DTD HTML Level 3//"), + STRING("-//IETF//DTD HTML Strict Level 0//"), + STRING("-//IETF//DTD HTML Strict Level 1//"), + STRING("-//IETF//DTD HTML Strict Level 2//"), + STRING("-//IETF//DTD HTML Strict Level 3//"), + STRING("-//IETF//DTD HTML Strict//"), + STRING("-//IETF//DTD HTML//"), + STRING("-//Metrius//DTD Metrius Presentational//"), + STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), + STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), + STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), + STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), + STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), + STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), + STRING("-//Netscape Comm. Corp.//DTD HTML//"), + STRING("-//Netscape Comm. 
Corp.//DTD Strict HTML//"), + STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), + STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), + STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), + STRING( + "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" + "extensions to HTML 4.0//"), + STRING( + "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" + "extensions to HTML 4.0//"), + STRING("-//Spyglass//DTD HTML 2.0 Extended//"), + STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), + STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), + STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), + STRING("-//W3C//DTD HTML 3 1995-03-24//"), + STRING("-//W3C//DTD HTML 3.2 Draft//"), + STRING("-//W3C//DTD HTML 3.2 Final//"), + STRING("-//W3C//DTD HTML 3.2//"), + STRING("-//W3C//DTD HTML 3.2S Draft//"), + STRING("-//W3C//DTD HTML 4.0 Frameset//"), + STRING("-//W3C//DTD HTML 4.0 Transitional//"), + STRING("-//W3C//DTD HTML Experimental 19960712//"), + STRING("-//W3C//DTD HTML Experimental 970421//"), + STRING("-//W3C//DTD W3 HTML//"), + STRING("-//W3O//DTD W3 HTML 3.0//"), + STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), + STRING("-//WebTechs//DTD Mozilla HTML//"), + TERMINATOR +}; static const GumboStringPiece kQuirksModePublicIdExactMatches[] = { - GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), - GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"), - TERMINATOR}; + STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), + STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), + STRING("HTML"), + TERMINATOR +}; static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = { - GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), - TERMINATOR}; + STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), + TERMINATOR +}; static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = { - GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"), - GUMBO_STRING("-//W3C//DTD XHTML 1.0 
Transitional//"), TERMINATOR}; - -static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = - {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"), - GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR}; - -// Indexed by GumboNamespaceEnum; keep in sync with that. -static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml", - "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"}; - -typedef struct _ReplacementEntry { - const GumboStringPiece from; - const GumboStringPiece to; -} ReplacementEntry; - -#define REPLACEMENT_ENTRY(from, to) \ - { GUMBO_STRING(from), GUMBO_STRING(to) } - -// Static data for SVG attribute replacements. -// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes -static const ReplacementEntry kSvgAttributeReplacements[] = { - REPLACEMENT_ENTRY("attributename", "attributeName"), - REPLACEMENT_ENTRY("attributetype", "attributeType"), - REPLACEMENT_ENTRY("basefrequency", "baseFrequency"), - REPLACEMENT_ENTRY("baseprofile", "baseProfile"), - REPLACEMENT_ENTRY("calcmode", "calcMode"), - REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"), - // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"), - // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"), - REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"), - REPLACEMENT_ENTRY("edgemode", "edgeMode"), - // REPLACEMENT_ENTRY("externalresourcesrequired", - // "externalResourcesRequired"), - // REPLACEMENT_ENTRY("filterres", "filterRes"), - REPLACEMENT_ENTRY("filterunits", "filterUnits"), - REPLACEMENT_ENTRY("glyphref", "glyphRef"), - REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"), - REPLACEMENT_ENTRY("gradientunits", "gradientUnits"), - REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"), - REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"), - REPLACEMENT_ENTRY("keypoints", "keyPoints"), - REPLACEMENT_ENTRY("keysplines", "keySplines"), - REPLACEMENT_ENTRY("keytimes", "keyTimes"), - 
REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"), - REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"), - REPLACEMENT_ENTRY("markerheight", "markerHeight"), - REPLACEMENT_ENTRY("markerunits", "markerUnits"), - REPLACEMENT_ENTRY("markerwidth", "markerWidth"), - REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"), - REPLACEMENT_ENTRY("maskunits", "maskUnits"), - REPLACEMENT_ENTRY("numoctaves", "numOctaves"), - REPLACEMENT_ENTRY("pathlength", "pathLength"), - REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"), - REPLACEMENT_ENTRY("patterntransform", "patternTransform"), - REPLACEMENT_ENTRY("patternunits", "patternUnits"), - REPLACEMENT_ENTRY("pointsatx", "pointsAtX"), - REPLACEMENT_ENTRY("pointsaty", "pointsAtY"), - REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"), - REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"), - REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"), - REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"), - REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"), - REPLACEMENT_ENTRY("repeatcount", "repeatCount"), - REPLACEMENT_ENTRY("repeatdur", "repeatDur"), - REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"), - REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"), - REPLACEMENT_ENTRY("specularconstant", "specularConstant"), - REPLACEMENT_ENTRY("specularexponent", "specularExponent"), - REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"), - REPLACEMENT_ENTRY("startoffset", "startOffset"), - REPLACEMENT_ENTRY("stddeviation", "stdDeviation"), - REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"), - REPLACEMENT_ENTRY("surfacescale", "surfaceScale"), - REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"), - REPLACEMENT_ENTRY("tablevalues", "tableValues"), - REPLACEMENT_ENTRY("targetx", "targetX"), - REPLACEMENT_ENTRY("targety", "targetY"), - REPLACEMENT_ENTRY("textlength", "textLength"), - REPLACEMENT_ENTRY("viewbox", "viewBox"), - REPLACEMENT_ENTRY("viewtarget", "viewTarget"), - 
REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"), - REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"), - REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"), + STRING("-//W3C//DTD XHTML 1.0 Frameset//"), + STRING("-//W3C//DTD XHTML 1.0 Transitional//"), + TERMINATOR }; -static const ReplacementEntry kSvgTagReplacements[] = { - REPLACEMENT_ENTRY("altglyph", "altGlyph"), - REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"), - REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"), - REPLACEMENT_ENTRY("animatecolor", "animateColor"), - REPLACEMENT_ENTRY("animatemotion", "animateMotion"), - REPLACEMENT_ENTRY("animatetransform", "animateTransform"), - REPLACEMENT_ENTRY("clippath", "clipPath"), - REPLACEMENT_ENTRY("feblend", "feBlend"), - REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"), - REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"), - REPLACEMENT_ENTRY("fecomposite", "feComposite"), - REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"), - REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"), - REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"), - REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"), - REPLACEMENT_ENTRY("feflood", "feFlood"), - REPLACEMENT_ENTRY("fefunca", "feFuncA"), - REPLACEMENT_ENTRY("fefuncb", "feFuncB"), - REPLACEMENT_ENTRY("fefuncg", "feFuncG"), - REPLACEMENT_ENTRY("fefuncr", "feFuncR"), - REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"), - REPLACEMENT_ENTRY("feimage", "feImage"), - REPLACEMENT_ENTRY("femerge", "feMerge"), - REPLACEMENT_ENTRY("femergenode", "feMergeNode"), - REPLACEMENT_ENTRY("femorphology", "feMorphology"), - REPLACEMENT_ENTRY("feoffset", "feOffset"), - REPLACEMENT_ENTRY("fepointlight", "fePointLight"), - REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"), - REPLACEMENT_ENTRY("fespotlight", "feSpotLight"), - REPLACEMENT_ENTRY("fetile", "feTile"), - REPLACEMENT_ENTRY("feturbulence", "feTurbulence"), - REPLACEMENT_ENTRY("foreignobject", "foreignObject"), - 
REPLACEMENT_ENTRY("glyphref", "glyphRef"), - REPLACEMENT_ENTRY("lineargradient", "linearGradient"), - REPLACEMENT_ENTRY("radialgradient", "radialGradient"), - REPLACEMENT_ENTRY("textpath", "textPath"), +static const GumboStringPiece kSystemIdDependentPublicIdPrefixes[] = { + STRING("-//W3C//DTD HTML 4.01 Frameset//"), + STRING("-//W3C//DTD HTML 4.01 Transitional//"), + TERMINATOR }; -typedef struct _NamespacedAttributeReplacement { - const char* from; - const char* local_name; - const GumboAttributeNamespaceEnum attr_namespace; -} NamespacedAttributeReplacement; - -static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = { - {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK}, - {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}, - {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML}, - {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML}, - {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS}, - {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS}, +// Indexed by GumboNamespaceEnum; keep in sync with that. +static const char* kLegalXmlns[] = { + "http://www.w3.org/1999/xhtml", + "http://www.w3.org/2000/svg", + "http://www.w3.org/1998/Math/MathML" }; -// The "scope marker" for the list of active formatting elements. We use a +// The "scope marker" for the list of active formatting elements. We use a // pointer to this as a generic marker element, since the particular element // scope doesn't matter. 
static const GumboNode kActiveFormattingScopeMarker; @@ -315,15 +186,15 @@ static const bool kStartTag = true; static const bool kEndTag = false; // Because GumboStringPieces are immutable, we can't insert a character directly -// into a text node. Instead, we accumulate all pending characters here and +// into a text node. Instead, we accumulate all pending characters here and // flush them out to a text node whenever a new element is inserted. // -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character +// https://html.spec.whatwg.org/multipage/parsing.html#insert-a-character typedef struct _TextNodeBufferState { // The accumulated text to be inserted into the current text node. GumboStringBuffer _buffer; - // A pointer to the original text represented by this text node. Note that + // A pointer to the original text represented by this text node. Note that // because of foster parenting and other strange DOM manipulations, this may // include other non-text HTML tags in it; it is defined as the span of // original text from the first character in this text node to the last @@ -338,24 +209,24 @@ typedef struct _TextNodeBufferState { } TextNodeBufferState; typedef struct GumboInternalParserState { - // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode + // https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode GumboInsertionMode _insertion_mode; // Used for run_generic_parsing_algorithm, which needs to switch back to the // original insertion mode at its conclusion. 
GumboInsertionMode _original_insertion_mode; - // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements + // https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements GumboVector /*GumboNode*/ _open_elements; - // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements + // https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements GumboVector /*GumboNode*/ _active_formatting_elements; // The stack of template insertion modes. - // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode + // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode GumboVector /*InsertionMode*/ _template_insertion_modes; - // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers + // https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers GumboNode* _head_element; GumboNode* _form_element; @@ -375,7 +246,7 @@ typedef struct GumboInternalParserState { bool _ignore_next_linefeed; // The flag for "whenever a node would be inserted into the current node, it - // must instead be foster parented". This is used for misnested table + // must instead be foster parented". This is used for misnested table // content, which needs to be handled according to "in body" rules yet foster // parented outside of the table. // It would perhaps be more explicit to have this as a parameter to @@ -392,7 +263,7 @@ typedef struct GumboInternalParserState { // The way that the spec is written, the </body> and </html> tags are *always* // implicit, because encountering one of those tokens merely switches the - // insertion mode out of "in body". So we have individual state flags for + // insertion mode out of "in body". 
So we have individual state flags for // those end tags that are then inspected by pop_current_node when the <body> // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG // flag appropriately. @@ -407,23 +278,31 @@ static bool token_has_attribute(const GumboToken* token, const char* name) { // Checks if the value of the specified attribute is a case-insensitive match // for the specified string. -static bool attribute_matches( - const GumboVector* attributes, const char* name, const char* value) { +static bool attribute_matches ( + const GumboVector* attributes, + const char* name, + const char* value +) { const GumboAttribute* attr = gumbo_get_attribute(attributes, name); - return attr ? strcasecmp(value, attr->value) == 0 : false; + return attr ? gumbo_ascii_strcasecmp(value, attr->value) == 0 : false; } // Checks if the value of the specified attribute is a case-sensitive match // for the specified string. -static bool attribute_matches_case_sensitive( - const GumboVector* attributes, const char* name, const char* value) { +static bool attribute_matches_case_sensitive ( + const GumboVector* attributes, + const char* name, + const char* value +) { const GumboAttribute* attr = gumbo_get_attribute(attributes, name); return attr ? strcmp(value, attr->value) == 0 : false; } // Checks if the specified attribute vectors are identical. 
-static bool all_attributes_match( - const GumboVector* attr1, const GumboVector* attr2) { +static bool all_attributes_match ( + const GumboVector* attr1, + const GumboVector* attr2 +) { unsigned int num_unmatched_attr2_elements = attr2->length; for (unsigned int i = 0; i < attr1->length; ++i) { const GumboAttribute* attr = attr1->data[i]; @@ -441,8 +320,8 @@ static void set_frameset_not_ok(GumboParser* parser) { parser->_parser_state->_frameset_ok = false; } -static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { - GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode)); +static GumboNode* create_node(GumboNodeType type) { + GumboNode* node = gumbo_alloc(sizeof(GumboNode)); node->parent = NULL; node->index_within_parent = -1; node->type = type; @@ -450,10 +329,10 @@ static GumboNode* create_node(GumboParser* parser, GumboNodeType type) { return node; } -static GumboNode* new_document_node(GumboParser* parser) { - GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT); +static GumboNode* new_document_node() { + GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT); document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; - gumbo_vector_init(parser, 1, &document_node->v.document.children); + gumbo_vector_init(1, &document_node->v.document.children); // Must be initialized explicitly, as there's no guarantee that we'll see a // doc type token. 
@@ -466,26 +345,26 @@ static GumboNode* new_document_node(GumboParser* parser) { } static void output_init(GumboParser* parser) { - GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); + GumboOutput* output = gumbo_alloc(sizeof(GumboOutput)); output->root = NULL; - output->document = new_document_node(parser); + output->document = new_document_node(); + output->status = GUMBO_STATUS_OK; parser->_output = output; gumbo_init_errors(parser); } static void parser_state_init(GumboParser* parser) { - GumboParserState* parser_state = - gumbo_parser_allocate(parser, sizeof(GumboParserState)); + GumboParserState* parser_state = gumbo_alloc(sizeof(GumboParserState)); parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; parser_state->_reprocess_current_token = false; parser_state->_frameset_ok = true; parser_state->_ignore_next_linefeed = false; parser_state->_foster_parent_insertions = false; parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; - gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer); - gumbo_vector_init(parser, 10, &parser_state->_open_elements); - gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements); - gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes); + gumbo_string_buffer_init(&parser_state->_text_node._buffer); + gumbo_vector_init(10, &parser_state->_open_elements); + gumbo_vector_init(5, &parser_state->_active_formatting_elements); + gumbo_vector_init(5, &parser_state->_template_insertion_modes); parser_state->_head_element = NULL; parser_state->_form_element = NULL; parser_state->_fragment_ctx = NULL; @@ -495,19 +374,94 @@ static void parser_state_init(GumboParser* parser) { parser->_parser_state = parser_state; } +typedef void (*TreeTraversalCallback)(GumboNode* node); + +static void tree_traverse(GumboNode* node, TreeTraversalCallback callback) { + GumboNode* current_node = node; + unsigned int offset = 0; + +tailcall: + switch (current_node->type) { + case 
GUMBO_NODE_DOCUMENT: + case GUMBO_NODE_TEMPLATE: + case GUMBO_NODE_ELEMENT: { + GumboVector* children = (current_node->type == GUMBO_NODE_DOCUMENT) + ? &current_node->v.document.children + : &current_node->v.element.children + ; + if (offset >= children->length) { + assert(offset == children->length); + break; + } else { + current_node = children->data[offset]; + offset = 0; + goto tailcall; + } + } + case GUMBO_NODE_TEXT: + case GUMBO_NODE_CDATA: + case GUMBO_NODE_COMMENT: + case GUMBO_NODE_WHITESPACE: + assert(offset == 0); + break; + } + + offset = current_node->index_within_parent + 1; + GumboNode* next_node = current_node->parent; + callback(current_node); + if (current_node == node) { + return; + } + current_node = next_node; + goto tailcall; +} + +static void destroy_node_callback(GumboNode* node) { + switch (node->type) { + case GUMBO_NODE_DOCUMENT: { + GumboDocument* doc = &node->v.document; + gumbo_free((void*) doc->children.data); + gumbo_free((void*) doc->name); + gumbo_free((void*) doc->public_identifier); + gumbo_free((void*) doc->system_identifier); + } break; + case GUMBO_NODE_TEMPLATE: + case GUMBO_NODE_ELEMENT: + for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) { + gumbo_destroy_attribute(node->v.element.attributes.data[i]); + } + gumbo_free(node->v.element.attributes.data); + gumbo_free(node->v.element.children.data); + if (node->v.element.tag == GUMBO_TAG_UNKNOWN) + gumbo_free((void *)node->v.element.name); + break; + case GUMBO_NODE_TEXT: + case GUMBO_NODE_CDATA: + case GUMBO_NODE_COMMENT: + case GUMBO_NODE_WHITESPACE: + gumbo_free((void*) node->v.text.text); + break; + } + gumbo_free(node); +} + +static void destroy_node(GumboNode* node) { + tree_traverse(node, &destroy_node_callback); +} + static void parser_state_destroy(GumboParser* parser) { GumboParserState* state = parser->_parser_state; if (state->_fragment_ctx) { - destroy_node(parser, state->_fragment_ctx); + destroy_node(state->_fragment_ctx); } - gumbo_vector_destroy(parser, 
&state->_active_formatting_elements); - gumbo_vector_destroy(parser, &state->_open_elements); - gumbo_vector_destroy(parser, &state->_template_insertion_modes); - gumbo_string_buffer_destroy(parser, &state->_text_node._buffer); - gumbo_parser_deallocate(parser, state); + gumbo_vector_destroy(&state->_active_formatting_elements); + gumbo_vector_destroy(&state->_open_elements); + gumbo_vector_destroy(&state->_template_insertion_modes); + gumbo_string_buffer_destroy(&state->_text_node._buffer); + gumbo_free(state); } -static GumboNode* get_document_node(GumboParser* parser) { +static GumboNode* get_document_node(const GumboParser* parser) { return parser->_output->document; } @@ -517,8 +471,8 @@ static bool is_fragment_parser(const GumboParser* parser) { // Returns the node at the bottom of the stack of open elements, or NULL if no // elements have been added yet. -static GumboNode* get_current_node(GumboParser* parser) { - GumboVector* open_elements = &parser->_parser_state->_open_elements; +static GumboNode* get_current_node(const GumboParser* parser) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; if (open_elements->length == 0) { assert(!parser->_output->root); return NULL; @@ -528,8 +482,8 @@ static GumboNode* get_current_node(GumboParser* parser) { return open_elements->data[open_elements->length - 1]; } -static GumboNode* get_adjusted_current_node(GumboParser* parser) { - GumboParserState* state = parser->_parser_state; +static GumboNode* get_adjusted_current_node(const GumboParser* parser) { + const GumboParserState* state = parser->_parser_state; if (state->_open_elements.length == 1 && state->_fragment_ctx) { return state->_fragment_ctx; } @@ -537,15 +491,20 @@ static GumboNode* get_adjusted_current_node(GumboParser* parser) { } // Returns true if the given needle is in the given array of literal -// GumboStringPieces. If exact_match is true, this requires that they match +// GumboStringPieces. 
If exact_match is true, this requires that they match // exactly; otherwise, this performs a prefix match to check if any of the -// elements in haystack start with needle. This always performs a +// elements in haystack start with needle. This always performs a // case-insensitive match. -static bool is_in_static_list( - const char* needle, const GumboStringPiece* haystack, bool exact_match) { +static bool is_in_static_list ( + const char* needle, + const GumboStringPiece* haystack, + bool exact_match +) { for (unsigned int i = 0; haystack[i].length > 0; ++i) { - if ((exact_match && !strcmp(needle, haystack[i].data)) || - (!exact_match && !strcasecmp(needle, haystack[i].data))) { + if ( + (exact_match && !strcmp(needle, haystack[i].data)) + || (!exact_match && !gumbo_ascii_strcasecmp(needle, haystack[i].data)) + ) { return true; } } @@ -556,13 +515,109 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { parser->_parser_state->_insertion_mode = mode; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately +static void push_template_insertion_mode ( + GumboParser* parser, + GumboInsertionMode mode +) { + gumbo_vector_add ( + (void*) mode, + &parser->_parser_state->_template_insertion_modes + ); +} + +static void pop_template_insertion_mode(GumboParser* parser) { + gumbo_vector_pop(&parser->_parser_state->_template_insertion_modes); +} + +// Returns the current template insertion mode. If the stack of template +// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. 
+static GumboInsertionMode get_current_template_insertion_mode ( + const GumboParser* parser +) { + GumboVector* modes = &parser->_parser_state->_template_insertion_modes; + if (modes->length == 0) { + return GUMBO_INSERTION_MODE_INITIAL; + } + return (GumboInsertionMode) modes->data[(modes->length - 1)]; +} + +// Returns true if the specified token is either a start or end tag +// (specified by is_start) with one of the tag types in the TagSet. +static bool tag_in ( + const GumboToken* token, + bool is_start, + const TagSet* tags +) { + GumboTag token_tag; + if (is_start && token->type == GUMBO_TOKEN_START_TAG) { + token_tag = token->v.start_tag.tag; + } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { + token_tag = token->v.end_tag.tag; + } else { + return false; + } + return (*tags)[(unsigned) token_tag] != 0u; +} + +// Like tag_in, but for the single-tag case. +static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { + if (is_start && token->type == GUMBO_TOKEN_START_TAG) { + return token->v.start_tag.tag == tag; + } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { + return token->v.end_tag.tag == tag; + } else { + return false; + } +} + +static inline bool tagset_includes ( + const TagSet* tagset, + GumboNamespaceEnum ns, + GumboTag tag +) { + return ((*tagset)[(unsigned) tag] & (1u << (unsigned) ns)) != 0u; +} + +// Like tag_in, but checks for the tag of a node, rather than a token. +static bool node_tag_in_set(const GumboNode* node, const TagSet* tags) { + assert(node != NULL); + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { + return false; + } + return tagset_includes ( + tags, + node->v.element.tag_namespace, + node->v.element.tag + ); +} + +// Like node_tag_in, but for the single-tag case. 
+static bool node_qualified_tag_is ( + const GumboNode* node, + GumboNamespaceEnum ns, + GumboTag tag +) { + assert(node); + return + (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) + && node->v.element.tag == tag + && node->v.element.tag_namespace == ns; +} + +// Like node_tag_in, but for the single-tag case in the HTML namespace +static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { + return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); +} + +// https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately // This is a helper function that returns the appropriate insertion mode instead -// of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to +// of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to // indicate that there is no appropriate insertion mode, and the loop should // continue. -static GumboInsertionMode get_appropriate_insertion_mode( - const GumboParser* parser, int index) { +static GumboInsertionMode get_appropriate_insertion_mode ( + const GumboParser* parser, + int index +) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; const GumboNode* node = open_elements->data[index]; const bool is_last = index == 0; @@ -572,10 +627,10 @@ static GumboInsertionMode get_appropriate_insertion_mode( } assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); - if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) - return is_last ? - GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; - + if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) { + return is_last ? 
GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; + } + switch (node->v.element.tag) { case GUMBO_TAG_SELECT: { if (is_last) { @@ -619,8 +674,8 @@ static GumboInsertionMode get_appropriate_insertion_mode( return GUMBO_INSERTION_MODE_IN_FRAMESET; case GUMBO_TAG_HTML: return parser->_parser_state->_head_element - ? GUMBO_INSERTION_MODE_AFTER_HEAD - : GUMBO_INSERTION_MODE_BEFORE_HEAD; + ? GUMBO_INSERTION_MODE_AFTER_HEAD + : GUMBO_INSERTION_MODE_BEFORE_HEAD; default: break; } @@ -642,8 +697,10 @@ static void reset_insertion_mode_appropriately(GumboParser* parser) { assert(0); } -static GumboError* parser_add_parse_error( - GumboParser* parser, const GumboToken* token) { +static GumboError* parser_add_parse_error ( + GumboParser* parser, + const GumboToken* token +) { gumbo_debug("Adding parse error.\n"); GumboError* error = gumbo_add_error(parser); if (!error) { @@ -658,144 +715,97 @@ static GumboError* parser_add_parse_error( if (token->type == GUMBO_TOKEN_START_TAG) { extra_data->input_tag = token->v.start_tag.tag; } else if (token->type == GUMBO_TOKEN_END_TAG) { - extra_data->input_tag = token->v.end_tag; + extra_data->input_tag = token->v.end_tag.tag; } - GumboParserState* state = parser->_parser_state; + const GumboParserState* state = parser->_parser_state; extra_data->parser_state = state->_insertion_mode; - gumbo_vector_init( - parser, state->_open_elements.length, &extra_data->tag_stack); + gumbo_vector_init(state->_open_elements.length, &extra_data->tag_stack); for (unsigned int i = 0; i < state->_open_elements.length; ++i) { const GumboNode* node = state->_open_elements.data[i]; - assert( - node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); - gumbo_vector_add( - parser, (void*) node->v.element.tag, &extra_data->tag_stack); + assert ( + node->type == GUMBO_NODE_ELEMENT + || node->type == GUMBO_NODE_TEMPLATE + ); + gumbo_vector_add ( + (void*) node->v.element.tag, + &extra_data->tag_stack + ); } return error; } -// Returns true 
if the specified token is either a start or end tag (specified -// by is_start) with one of the tag types in the varargs list. Terminate the -// list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of -// the spec references tags that are not in the spec. -static bool tag_in( - const GumboToken* token, bool is_start, const gumbo_tagset tags) { - GumboTag token_tag; - if (is_start && token->type == GUMBO_TOKEN_START_TAG) { - token_tag = token->v.start_tag.tag; - } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { - token_tag = token->v.end_tag; - } else { - return false; - } - return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0); -} - -// Like tag_in, but for the single-tag case. -static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { - if (is_start && token->type == GUMBO_TOKEN_START_TAG) { - return token->v.start_tag.tag == tag; - } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { - return token->v.end_tag == tag; - } else { - return false; - } +// https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point +static bool is_mathml_integration_point(const GumboNode* node) { + static const TagSet mathml_integration_point_tags = { + TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), + TAG_MATHML(MS), TAG_MATHML(MTEXT) + }; + return node_tag_in_set(node, &mathml_integration_point_tags); } -// Like tag_in, but checks for the tag of a node, rather than a token. 
-static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) { - assert(node != NULL); - if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { - return false; +// https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point +static bool is_html_integration_point(const GumboNode* node) { + static const TagSet html_integration_point_svg_tags = { + TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) + }; + if (node_tag_in_set(node, &html_integration_point_svg_tags)) { + return true; } - return TAGSET_INCLUDES( - tags, node->v.element.tag_namespace, node->v.element.tag); -} - -// Like node_tag_in, but for the single-tag case. -static bool node_qualified_tag_is( - const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) { - assert(node); - return (node->type == GUMBO_NODE_ELEMENT || - node->type == GUMBO_NODE_TEMPLATE) && - node->v.element.tag == tag && node->v.element.tag_namespace == ns; -} -// Like node_tag_in, but for the single-tag case in the HTML namespace -static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { - return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); -} - -static void push_template_insertion_mode( - GumboParser* parser, GumboInsertionMode mode) { - gumbo_vector_add( - parser, (void*) mode, &parser->_parser_state->_template_insertion_modes); -} - -static void pop_template_insertion_mode(GumboParser* parser) { - gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes); -} - -// Returns the current template insertion mode. If the stack of template -// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. 
-static GumboInsertionMode get_current_template_insertion_mode( - const GumboParser* parser) { - GumboVector* template_insertion_modes = - &parser->_parser_state->_template_insertion_modes; - if (template_insertion_modes->length == 0) { - return GUMBO_INSERTION_MODE_INITIAL; + const bool is_mathml_annotation_xml_element = node_qualified_tag_is ( + node, + GUMBO_NAMESPACE_MATHML, + GUMBO_TAG_ANNOTATION_XML + ); + const GumboVector* attributes = &node->v.element.attributes; + if ( + is_mathml_annotation_xml_element + && ( + attribute_matches(attributes, "encoding", "text/html") + || attribute_matches(attributes, "encoding", "application/xhtml+xml") + ) + ) { + return true; } - return (GumboInsertionMode) - template_insertion_modes->data[(template_insertion_modes->length - 1)]; -} - -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point -static bool is_mathml_integration_point(const GumboNode* node) { - return node_tag_in_set( - node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), - TAG_MATHML(MS), TAG_MATHML(MTEXT)}); -} -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point -static bool is_html_integration_point(const GumboNode* node) { - return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT), - TAG_SVG(DESC), TAG_SVG(TITLE)}) || - (node_qualified_tag_is( - node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && - (attribute_matches( - &node->v.element.attributes, "encoding", "text/html") || - attribute_matches(&node->v.element.attributes, "encoding", - "application/xhtml+xml"))); + return false; } // This represents a place to insert a node, consisting of a target parent and a -// child index within that parent. If the node should be inserted at the end of +// child index within that parent. If the node should be inserted at the end of // the parent's child, index will be -1. 
typedef struct { GumboNode* target; int index; } InsertionLocation; -InsertionLocation get_appropriate_insertion_location( - GumboParser* parser, GumboNode* override_target) { +static InsertionLocation get_appropriate_insertion_location ( + const GumboParser* parser, + GumboNode* override_target +) { InsertionLocation retval = {override_target, -1}; if (retval.target == NULL) { // No override target; default to the current node, but special-case the // root node since get_current_node() assumes the stack of open elements is // non-empty. - retval.target = parser->_output->root != NULL ? get_current_node(parser) - : get_document_node(parser); + retval.target = (parser->_output->root != NULL) + ? get_current_node(parser) + : get_document_node(parser) + ; } - if (!parser->_parser_state->_foster_parent_insertions || - !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY), - TAG(TFOOT), TAG(THEAD), TAG(TR)})) { + if ( + !parser->_parser_state->_foster_parent_insertions + || !node_tag_in_set(retval.target, &(const TagSet) { + TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) + }) + ) { return retval; } // Foster-parenting case. 
int last_template_index = -1; int last_table_index = -1; - GumboVector* open_elements = &parser->_parser_state->_open_elements; + const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (unsigned int i = 0; i < open_elements->length; ++i) { if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { last_template_index = i; @@ -804,8 +814,10 @@ InsertionLocation get_appropriate_insertion_location( last_table_index = i; } } - if (last_template_index != -1 && - (last_table_index == -1 || last_template_index > last_table_index)) { + if ( + last_template_index != -1 + && (last_table_index == -1 || last_template_index > last_table_index) + ) { retval.target = open_elements->data[last_template_index]; return retval; } @@ -813,7 +825,7 @@ InsertionLocation get_appropriate_insertion_location( retval.target = open_elements->data[0]; return retval; } - GumboNode* last_table = open_elements->data[last_table_index]; + const GumboNode* last_table = open_elements->data[last_table_index]; if (last_table->parent != NULL) { retval.target = last_table->parent; retval.index = last_table->index_within_parent; @@ -826,13 +838,14 @@ InsertionLocation get_appropriate_insertion_location( // Appends a node to the end of its parent, setting the "parent" and // "index_within_parent" fields appropriately. 
-static void append_node( - GumboParser* parser, GumboNode* parent, GumboNode* node) { +static void append_node(GumboNode* parent, GumboNode* node) { assert(node->parent == NULL); - assert(node->index_within_parent == -1); + assert(node->index_within_parent == (unsigned int) -1); GumboVector* children; - if (parent->type == GUMBO_NODE_ELEMENT || - parent->type == GUMBO_NODE_TEMPLATE) { + if ( + parent->type == GUMBO_NODE_ELEMENT + || parent->type == GUMBO_NODE_TEMPLATE + ) { children = &parent->v.element.children; } else { assert(parent->type == GUMBO_NODE_DOCUMENT); @@ -840,23 +853,24 @@ static void append_node( } node->parent = parent; node->index_within_parent = children->length; - gumbo_vector_add(parser, (void*) node, children); + gumbo_vector_add((void*) node, children); assert(node->index_within_parent < children->length); } // Inserts a node at the specified InsertionLocation, updating the // "parent" and "index_within_parent" fields of it and all its siblings. // If the index of the location is -1, this calls append_node. 
-static void insert_node( - GumboParser* parser, GumboNode* node, InsertionLocation location) { +static void insert_node(GumboNode* node, InsertionLocation location) { assert(node->parent == NULL); - assert(node->index_within_parent == -1); + assert(node->index_within_parent == (unsigned int) -1); GumboNode* parent = location.target; int index = location.index; if (index != -1) { GumboVector* children = NULL; - if (parent->type == GUMBO_NODE_ELEMENT || - parent->type == GUMBO_NODE_TEMPLATE) { + if ( + parent->type == GUMBO_NODE_ELEMENT + || parent->type == GUMBO_NODE_TEMPLATE + ) { children = &parent->v.element.children; } else if (parent->type == GUMBO_NODE_DOCUMENT) { children = &parent->v.document.children; @@ -869,7 +883,7 @@ static void insert_node( assert((unsigned int) index < children->length); node->parent = parent; node->index_within_parent = index; - gumbo_vector_insert_at(parser, (void*) node, index, children); + gumbo_vector_insert_at((void*) node, index, children); assert(node->index_within_parent < children->length); for (unsigned int i = index + 1; i < children->length; ++i) { GumboNode* sibling = children->data[i]; @@ -877,7 +891,7 @@ static void insert_node( assert(sibling->index_within_parent < children->length); } } else { - append_node(parser, parent, node); + append_node(parent, node); } } @@ -888,42 +902,49 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { return; } - assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || - buffer_state->_type == GUMBO_NODE_TEXT || - buffer_state->_type == GUMBO_NODE_CDATA); - GumboNode* text_node = create_node(parser, buffer_state->_type); + assert ( + buffer_state->_type == GUMBO_NODE_WHITESPACE + || buffer_state->_type == GUMBO_NODE_TEXT + || buffer_state->_type == GUMBO_NODE_CDATA + ); + GumboNode* text_node = create_node(buffer_state->_type); GumboText* text_node_data = &text_node->v.text; - text_node_data->text = - gumbo_string_buffer_to_string(parser, &buffer_state->_buffer); + 
text_node_data->text = gumbo_string_buffer_to_string(&buffer_state->_buffer); text_node_data->original_text.data = buffer_state->_start_original_text; text_node_data->original_text.length = state->_current_token->original_text.data - buffer_state->_start_original_text; text_node_data->start_pos = buffer_state->_start_position; - gumbo_debug("Flushing text node buffer of %.*s.\n", - (int) buffer_state->_buffer.length, buffer_state->_buffer.data); + gumbo_debug ( + "Flushing text node buffer of %.*s.\n", + (int) buffer_state->_buffer.length, + buffer_state->_buffer.data + ); InsertionLocation location = get_appropriate_insertion_location(parser, NULL); if (location.target->type == GUMBO_NODE_DOCUMENT) { // The DOM does not allow Document nodes to have Text children, so per the // spec, they are dropped on the floor. - destroy_node(parser, text_node); + destroy_node(text_node); } else { - insert_node(parser, text_node, location); + insert_node(text_node, location); } - gumbo_string_buffer_clear(parser, &buffer_state->_buffer); + gumbo_string_buffer_clear(&buffer_state->_buffer); buffer_state->_type = GUMBO_NODE_WHITESPACE; assert(buffer_state->_buffer.length == 0); } -static void record_end_of_element( - GumboToken* current_token, GumboElement* element) { +static void record_end_of_element ( + const GumboToken* current_token, + GumboElement* element +) { element->end_pos = current_token->position; - element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG - ? current_token->original_text - : kGumboEmptyString; + element->original_end_tag = + (current_token->type == GUMBO_TOKEN_END_TAG) + ? 
current_token->original_text + : kGumboEmptyString; } static GumboNode* pop_current_node(GumboParser* parser) { @@ -931,24 +952,36 @@ static GumboNode* pop_current_node(GumboParser* parser) { maybe_flush_text_node_buffer(parser); if (state->_open_elements.length > 0) { assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); - gumbo_debug("Popping %s node.\n", - gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); + gumbo_debug ( + "Popping %s node.\n", + gumbo_normalized_tagname(get_current_node(parser)->v.element.tag) + ); } - GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements); + GumboNode* current_node = gumbo_vector_pop(&state->_open_elements); if (!current_node) { assert(state->_open_elements.length == 0); return NULL; } - assert(current_node->type == GUMBO_NODE_ELEMENT || - current_node->type == GUMBO_NODE_TEMPLATE); + assert ( + current_node->type == GUMBO_NODE_ELEMENT + || current_node->type == GUMBO_NODE_TEMPLATE + ); bool is_closed_body_or_html_tag = - (node_html_tag_is(current_node, GUMBO_TAG_BODY) && - state->_closed_body_tag) || - (node_html_tag_is(current_node, GUMBO_TAG_HTML) && - state->_closed_html_tag); - if ((state->_current_token->type != GUMBO_TOKEN_END_TAG || - !node_html_tag_is(current_node, state->_current_token->v.end_tag)) && - !is_closed_body_or_html_tag) { + ( + node_html_tag_is(current_node, GUMBO_TAG_BODY) + && state->_closed_body_tag + ) || ( + node_html_tag_is(current_node, GUMBO_TAG_HTML) + && state->_closed_html_tag + ) + ; + if ( + ( + state->_current_token->type != GUMBO_TOKEN_END_TAG + || !node_html_tag_is(current_node, state->_current_token->v.end_tag.tag) + ) + && !is_closed_body_or_html_tag + ) { current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; } if (!is_closed_body_or_html_tag) { @@ -957,76 +990,89 @@ static GumboNode* pop_current_node(GumboParser* parser) { return current_node; } -static void append_comment_node( - GumboParser* parser, GumboNode* node, 
const GumboToken* token) { +static void append_comment_node ( + GumboParser* parser, + GumboNode* node, + const GumboToken* token +) { maybe_flush_text_node_buffer(parser); - GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT); + GumboNode* comment = create_node(GUMBO_NODE_COMMENT); comment->type = GUMBO_NODE_COMMENT; comment->parse_flags = GUMBO_INSERTION_NORMAL; comment->v.text.text = token->v.text; comment->v.text.original_text = token->original_text; comment->v.text.start_pos = token->position; - append_node(parser, node, comment); + append_node(node, comment); } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context +// https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context static void clear_stack_to_table_row_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) { + static const TagSet tags = {TAG(HTML), TAG(TR), TAG(TEMPLATE)}; + while (!node_tag_in_set(get_current_node(parser), &tags)) { pop_current_node(parser); } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context +// https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context static void clear_stack_to_table_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) { + static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}; + while (!node_tag_in_set(get_current_node(parser), &tags)) { pop_current_node(parser); } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context -void clear_stack_to_table_body_context(GumboParser* parser) { - while (!node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), 
- TAG(TEMPLATE)})) { +// https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context +static void clear_stack_to_table_body_context(GumboParser* parser) { + static const TagSet tags = { + TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) + }; + while (!node_tag_in_set(get_current_node(parser), &tags)) { pop_current_node(parser); } } // Creates a parser-inserted element in the HTML namespace and returns it. static GumboNode* create_element(GumboParser* parser, GumboTag tag) { - GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT); + GumboNode* node = create_node(GUMBO_NODE_ELEMENT); GumboElement* element = &node->v.element; - gumbo_vector_init(parser, 1, &element->children); - gumbo_vector_init(parser, 0, &element->attributes); + gumbo_vector_init(1, &element->children); + gumbo_vector_init(0, &element->attributes); element->tag = tag; + element->name = gumbo_normalized_tagname(tag); element->tag_namespace = GUMBO_NAMESPACE_HTML; element->original_tag = kGumboEmptyString; element->original_end_tag = kGumboEmptyString; element->start_pos = (parser->_parser_state->_current_token) - ? parser->_parser_state->_current_token->position - : kGumboEmptySourcePosition; + ? parser->_parser_state->_current_token->position + : kGumboEmptySourcePosition + ; element->end_pos = kGumboEmptySourcePosition; return node; } // Constructs an element from the given start tag token. -static GumboNode* create_element_from_token( - GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { +static GumboNode* create_element_from_token ( + GumboToken* token, + GumboNamespaceEnum tag_namespace +) { assert(token->type == GUMBO_TOKEN_START_TAG); GumboTokenStartTag* start_tag = &token->v.start_tag; - GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML && - start_tag->tag == GUMBO_TAG_TEMPLATE) - ? 
GUMBO_NODE_TEMPLATE - : GUMBO_NODE_ELEMENT; + GumboNodeType type = + ( + tag_namespace == GUMBO_NAMESPACE_HTML + && start_tag->tag == GUMBO_TAG_TEMPLATE + ) + ? GUMBO_NODE_TEMPLATE + : GUMBO_NODE_ELEMENT + ; - GumboNode* node = create_node(parser, type); + GumboNode* node = create_node(type); GumboElement* element = &node->v.element; - gumbo_vector_init(parser, 1, &element->children); + gumbo_vector_init(1, &element->children); element->attributes = start_tag->attributes; element->tag = start_tag->tag; + element->name = start_tag->name ? start_tag->name : gumbo_normalized_tagname(start_tag->tag); element->tag_namespace = tag_namespace; assert(token->original_text.length >= 2); @@ -1037,93 +1083,125 @@ static GumboNode* create_element_from_token( element->original_end_tag = kGumboEmptyString; element->end_pos = kGumboEmptySourcePosition; - // The element takes ownership of the attributes from the token, so any - // allocated-memory fields should be nulled out. + // The element takes ownership of the attributes and name from the token, so + // any allocated-memory fields should be nulled out. start_tag->attributes = kGumboEmptyVector; + start_tag->name = NULL; return node; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element -static void insert_element(GumboParser* parser, GumboNode* node, - bool is_reconstructing_formatting_elements) { +// https://html.spec.whatwg.org/multipage/parsing.html#insert-an-html-element +static void insert_element ( + GumboParser* parser, + GumboNode* node, + bool is_reconstructing_formatting_elements +) { GumboParserState* state = parser->_parser_state; // NOTE(jdtang): The text node buffer must always be flushed before inserting // a node, otherwise we're handling nodes in a different order than the spec - // mandated. However, one clause of the spec (character tokens in the body) + // mandated. 
However, one clause of the spec (character tokens in the body) // requires that we reconstruct the active formatting elements *before* adding // the character, and reconstructing the active formatting elements may itself // result in the insertion of new elements (which should be pushed onto the - // stack of open elements before the buffer is flushed). We solve this (for + // stack of open elements before the buffer is flushed). We solve this (for // the time being, the spec has been rewritten for <template> and the new // version may be simpler here) with a boolean flag to this method. if (!is_reconstructing_formatting_elements) { maybe_flush_text_node_buffer(parser); } InsertionLocation location = get_appropriate_insertion_location(parser, NULL); - insert_node(parser, node, location); - gumbo_vector_add(parser, (void*) node, &state->_open_elements); + insert_node(node, location); + gumbo_vector_add((void*) node, &state->_open_elements); } // Convenience method that combines create_element_from_token and // insert_element, inserting the generated element directly into the current -// node. Returns the node inserted. -static GumboNode* insert_element_from_token( - GumboParser* parser, GumboToken* token) { - GumboNode* element = - create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML); +// node. Returns the node inserted. +static GumboNode* insert_element_from_token ( + GumboParser* parser, + GumboToken* token +) { + GumboNode* element = create_element_from_token(token, GUMBO_NAMESPACE_HTML); insert_element(parser, element, false); - gumbo_debug("Inserting <%s> element (@%x) from token.\n", - gumbo_normalized_tagname(element->v.element.tag), element); + gumbo_debug ( + "Inserting <%s> element (@%p) from token.\n", + gumbo_normalized_tagname(element->v.element.tag), + (void*)element + ); return element; } // Convenience method that combines create_element and insert_element, inserting -// a parser-generated element of a specific tag type. 
Returns the node +// a parser-generated element of a specific tag type. Returns the node // inserted. -static GumboNode* insert_element_of_tag_type( - GumboParser* parser, GumboTag tag, GumboParseFlags reason) { +static GumboNode* insert_element_of_tag_type ( + GumboParser* parser, + GumboTag tag, + GumboParseFlags reason +) { GumboNode* element = create_element(parser, tag); element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason; insert_element(parser, element, false); - gumbo_debug("Inserting %s element (@%x) from tag type.\n", - gumbo_normalized_tagname(tag), element); + gumbo_debug ( + "Inserting %s element (@%p) from tag type.\n", + gumbo_normalized_tagname(tag), + (void*)element + ); return element; } -// Convenience method for creating foreign namespaced element. Returns the node +// Convenience method for creating foreign namespaced element. Returns the node // inserted. -static GumboNode* insert_foreign_element( - GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) { +static GumboNode* insert_foreign_element ( + GumboParser* parser, + GumboToken* token, + GumboNamespaceEnum tag_namespace +) { assert(token->type == GUMBO_TOKEN_START_TAG); - GumboNode* element = create_element_from_token(parser, token, tag_namespace); + GumboNode* element = create_element_from_token(token, tag_namespace); insert_element(parser, element, false); - if (token_has_attribute(token, "xmlns") && - !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns", - kLegalXmlns[tag_namespace])) { + if ( + token_has_attribute(token, "xmlns") + && !attribute_matches_case_sensitive ( + &token->v.start_tag.attributes, + "xmlns", + kLegalXmlns[tag_namespace] + ) + ) { // TODO(jdtang): Since there're multiple possible error codes here, we // eventually need reason codes to differentiate them. 
parser_add_parse_error(parser, token); } - if (token_has_attribute(token, "xmlns:xlink") && - !attribute_matches_case_sensitive(&token->v.start_tag.attributes, - "xmlns:xlink", "http://www.w3.org/1999/xlink")) { + if ( + token_has_attribute(token, "xmlns:xlink") + && !attribute_matches_case_sensitive ( + &token->v.start_tag.attributes, + "xmlns:xlink", + "http://www.w3.org/1999/xlink" + ) + ) { parser_add_parse_error(parser, token); } return element; } static void insert_text_token(GumboParser* parser, GumboToken* token) { - assert(token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA); + assert ( + token->type == GUMBO_TOKEN_WHITESPACE + || token->type == GUMBO_TOKEN_CHARACTER + || token->type == GUMBO_TOKEN_NULL + || token->type == GUMBO_TOKEN_CDATA + ); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { // Initialize position fields. 
buffer_state->_start_original_text = token->original_text.data; buffer_state->_start_position = token->position; } - gumbo_string_buffer_append_codepoint( - parser, token->v.character, &buffer_state->_buffer); + gumbo_string_buffer_append_codepoint ( + token->v.character, + &buffer_state->_buffer + ); if (token->type == GUMBO_TOKEN_CHARACTER) { buffer_state->_type = GUMBO_NODE_TEXT; } else if (token->type == GUMBO_TOKEN_CDATA) { @@ -1132,14 +1210,17 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) { gumbo_debug("Inserting text token '%c'.\n", token->v.character); } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm -static void run_generic_parsing_algorithm( - GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) { +// https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm +static void run_generic_parsing_algorithm ( + GumboParser* parser, + GumboToken* token, + GumboTokenizerEnum lexer_state +) { insert_element_from_token(parser, token); gumbo_tokenizer_set_state(parser, lexer_state); - parser->_parser_state->_original_insertion_mode = - parser->_parser_state->_insertion_mode; - parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT; + GumboParserState* parser_state = parser->_parser_state; + parser_state->_original_insertion_mode = parser_state->_insertion_mode; + parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT; } static void acknowledge_self_closing_tag(GumboParser* parser) { @@ -1165,10 +1246,13 @@ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) { // Counts the number of open formatting elements in the list of active // formatting elements (after the last active scope marker) that have a specific -// tag. If this is > 0, then earliest_matching_index will be filled in with the +// tag. 
If this is > 0, then earliest_matching_index will be filled in with the // index of the first such element. -static int count_formatting_elements_of_tag(GumboParser* parser, - const GumboNode* desired_node, int* earliest_matching_index) { +static int count_formatting_elements_of_tag ( + GumboParser* parser, + const GumboNode* desired_node, + int* earliest_matching_index +) { const GumboElement* desired_element = &desired_node->v.element; GumboVector* elements = &parser->_parser_state->_active_formatting_elements; int num_identical_elements = 0; @@ -1178,10 +1262,10 @@ static int count_formatting_elements_of_tag(GumboParser* parser, break; } assert(node->type == GUMBO_NODE_ELEMENT); - if (node_qualified_tag_is( - node, desired_element->tag_namespace, desired_element->tag) && - all_attributes_match( - &node->v.element.attributes, &desired_element->attributes)) { + if ( + node_qualified_tag_is(node, desired_element->tag_namespace, desired_element->tag) + && all_attributes_match(&node->v.element.attributes, &desired_element->attributes) + ) { num_identical_elements++; *earliest_matching_index = i; } @@ -1189,10 +1273,12 @@ static int count_formatting_elements_of_tag(GumboParser* parser, return num_identical_elements; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements +// https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements static void add_formatting_element(GumboParser* parser, const GumboNode* node) { - assert(node == &kActiveFormattingScopeMarker || - node->type == GUMBO_NODE_ELEMENT); + assert ( + node == &kActiveFormattingScopeMarker + || node->type == GUMBO_NODE_ELEMENT + ); GumboVector* elements = &parser->_parser_state->_active_formatting_elements; if (node == &kActiveFormattingScopeMarker) { gumbo_debug("Adding a scope marker.\n"); @@ -1202,21 +1288,26 @@ static void add_formatting_element(GumboParser* parser, const GumboNode* node) { // Hunt for 
identical elements. int earliest_identical_element = elements->length; - int num_identical_elements = count_formatting_elements_of_tag( - parser, node, &earliest_identical_element); + int num_identical_elements = count_formatting_elements_of_tag ( + parser, + node, + &earliest_identical_element + ); // Noah's Ark clause: if there're at least 3, remove the earliest. if (num_identical_elements >= 3) { - gumbo_debug("Noah's ark clause: removing element at %d.\n", - earliest_identical_element); - gumbo_vector_remove_at(parser, earliest_identical_element, elements); + gumbo_debug ( + "Noah's ark clause: removing element at %d.\n", + earliest_identical_element + ); + gumbo_vector_remove_at(earliest_identical_element, elements); } - gumbo_vector_add(parser, (void*) node, elements); + gumbo_vector_add((void*) node, elements); } -static bool is_open_element(GumboParser* parser, const GumboNode* node) { - GumboVector* open_elements = &parser->_parser_state->_open_elements; +static bool is_open_element(const GumboParser* parser, const GumboNode* node) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (unsigned int i = 0; i < open_elements->length; ++i) { if (open_elements->data[i] == node) { return true; @@ -1225,13 +1316,15 @@ static bool is_open_element(GumboParser* parser, const GumboNode* node) { return false; } -// Clones attributes, tags, etc. of a node, but does not copy the content. The +// Clones attributes, tags, etc. of a node, but does not copy the content. The // clone shares no structure with the original node: all owned strings and // values are fresh copies. 
-GumboNode* clone_node( - GumboParser* parser, GumboNode* node, GumboParseFlags reason) { +static GumboNode* clone_node ( + GumboNode* node, + GumboParseFlags reason +) { assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); - GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode)); + GumboNode* new_node = gumbo_alloc(sizeof(GumboNode)); *new_node = *node; new_node->parent = NULL; new_node->index_within_parent = -1; @@ -1240,26 +1333,25 @@ GumboNode* clone_node( new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG; new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER; GumboElement* element = &new_node->v.element; - gumbo_vector_init(parser, 1, &element->children); + gumbo_vector_init(1, &element->children); const GumboVector* old_attributes = &node->v.element.attributes; - gumbo_vector_init(parser, old_attributes->length, &element->attributes); + gumbo_vector_init(old_attributes->length, &element->attributes); for (unsigned int i = 0; i < old_attributes->length; ++i) { const GumboAttribute* old_attr = old_attributes->data[i]; - GumboAttribute* attr = - gumbo_parser_allocate(parser, sizeof(GumboAttribute)); + GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute)); *attr = *old_attr; - attr->name = gumbo_copy_stringz(parser, old_attr->name); - attr->value = gumbo_copy_stringz(parser, old_attr->value); - gumbo_vector_add(parser, attr, &element->attributes); + attr->name = gumbo_strdup(old_attr->name); + attr->value = gumbo_strdup(old_attr->value); + gumbo_vector_add(attr, &element->attributes); } return new_node; } // "Reconstruct active formatting elements" part of the spec. -// This implementation is based on the html5lib translation from the mess of -// GOTOs in the spec to reasonably structured programming. 
-// http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py +// This implementation is based on the html5lib translation from the +// mess of GOTOs in the spec to reasonably structured programming. +// https://github.com/html5lib/html5lib-python/blob/master/html5lib/treebuilders/base.py static void reconstruct_active_formatting_elements(GumboParser* parser) { GumboVector* elements = &parser->_parser_state->_active_formatting_elements; // Step 1 @@ -1270,8 +1362,10 @@ static void reconstruct_active_formatting_elements(GumboParser* parser) { // Step 2 & 3 unsigned int i = elements->length - 1; GumboNode* element = elements->data[i]; - if (element == &kActiveFormattingScopeMarker || - is_open_element(parser, element)) { + if ( + element == &kActiveFormattingScopeMarker + || is_open_element(parser, element) + ) { return; } @@ -1284,31 +1378,43 @@ static void reconstruct_active_formatting_elements(GumboParser* parser) { } // Step 5 element = elements->data[--i]; - } while (element != &kActiveFormattingScopeMarker && - !is_open_element(parser, element)); + } while ( + element != &kActiveFormattingScopeMarker + && !is_open_element(parser, element) + ); ++i; - gumbo_debug("Reconstructing elements from %d on %s parent.\n", i, - gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)); + gumbo_debug ( + "Reconstructing elements from %u on %s parent.\n", + i, + gumbo_normalized_tagname(get_current_node(parser)->v.element.tag) + ); for (; i < elements->length; ++i) { // Step 7 & 8. assert(elements->length > 0); assert(i < elements->length); element = elements->data[i]; assert(element != &kActiveFormattingScopeMarker); - GumboNode* clone = clone_node( - parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT); + GumboNode* clone = clone_node ( + element, + GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT + ); // Step 9. 
InsertionLocation location = get_appropriate_insertion_location(parser, NULL); - insert_node(parser, clone, location); - gumbo_vector_add( - parser, (void*) clone, &parser->_parser_state->_open_elements); + insert_node(clone, location); + gumbo_vector_add ( + (void*) clone, + &parser->_parser_state->_open_elements + ); // Step 10. elements->data[i] = clone; - gumbo_debug("Reconstructed %s element at %d.\n", - gumbo_normalized_tagname(clone->v.element.tag), i); + gumbo_debug ( + "Reconstructed %s element at %u.\n", + gumbo_normalized_tagname(clone->v.element.tag), + i + ); } } @@ -1317,109 +1423,150 @@ static void clear_active_formatting_elements(GumboParser* parser) { int num_elements_cleared = 0; const GumboNode* node; do { - node = gumbo_vector_pop(parser, elements); + node = gumbo_vector_pop(elements); ++num_elements_cleared; } while (node && node != &kActiveFormattingScopeMarker); - gumbo_debug("Cleared %d elements from active formatting list.\n", - num_elements_cleared); + gumbo_debug ( + "Cleared %d elements from active formatting list.\n", + num_elements_cleared + ); } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode -static GumboQuirksModeEnum compute_quirks_mode( - const GumboTokenDocType* doctype) { - if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) || - is_in_static_list( - doctype->public_identifier, kQuirksModePublicIdPrefixes, false) || - is_in_static_list( - doctype->public_identifier, kQuirksModePublicIdExactMatches, true) || - is_in_static_list( - doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) || - (is_in_static_list(doctype->public_identifier, - kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) && - !doctype->has_system_identifier)) { +// https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode +static GumboQuirksModeEnum compute_quirks_mode(const GumboTokenDocType* doctype) { + const char *const pubid = 
doctype->public_identifier; + const char *const sysid = doctype->system_identifier; + + if ( + doctype->force_quirks + || strcmp(doctype->name, "html") + || is_in_static_list(pubid, kQuirksModePublicIdPrefixes, false) + || is_in_static_list(pubid, kQuirksModePublicIdExactMatches, true) + || is_in_static_list(sysid, kQuirksModeSystemIdExactMatches, true) + || ( + !doctype->has_system_identifier + && is_in_static_list(pubid, kSystemIdDependentPublicIdPrefixes, false) + ) + ) { return GUMBO_DOCTYPE_QUIRKS; - } else if (is_in_static_list(doctype->public_identifier, - kLimitedQuirksPublicIdPrefixes, false) || - (is_in_static_list(doctype->public_identifier, - kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) && - doctype->has_system_identifier)) { + } + + if ( + is_in_static_list(pubid, kLimitedQuirksPublicIdPrefixes, false) + || ( + doctype->has_system_identifier + && is_in_static_list(pubid, kSystemIdDependentPublicIdPrefixes, false) + ) + ) { return GUMBO_DOCTYPE_LIMITED_QUIRKS; } + return GUMBO_DOCTYPE_NO_QUIRKS; } // The following functions are all defined by the "has an element in __ scope" // sections of the HTML5 spec: -// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope // The basic idea behind them is that they check for an element of the given // qualified name, contained within a scope formed by a set of other qualified -// names. For example, "has an element in list scope" looks for an element of +// names. For example, "has an element in list scope" looks for an element of // the given qualified name within the nearest enclosing <ol> or <ul>, along // with a bunch of generic element types that serve to "firewall" their content // from the rest of the document. 
Note that because of the way the spec is // written, // all elements are expected to be in the HTML namespace -static bool has_an_element_in_specific_scope(GumboParser* parser, - int expected_size, const GumboTag* expected, bool negate, - const gumbo_tagset tags) { - GumboVector* open_elements = &parser->_parser_state->_open_elements; +static bool has_an_element_in_specific_scope ( + const GumboParser* parser, + int expected_size, + const GumboTag* expected, + bool negate, + const TagSet* tags +) { + const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (int i = open_elements->length; --i >= 0;) { const GumboNode* node = open_elements->data[i]; - if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) + if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { continue; + } GumboTag node_tag = node->v.element.tag; GumboNamespaceEnum node_ns = node->v.element.tag_namespace; for (int j = 0; j < expected_size; ++j) { - if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) + if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) { return true; + } } - bool found = TAGSET_INCLUDES(tags, node_ns, node_tag); - if (negate != found) return false; + bool found = tagset_includes(tags, node_ns, node_tag); + if (negate != found) { + return false; + } } return false; } // Checks for the presence of an open element of the specified tag type. 
-static bool has_open_element(GumboParser* parser, GumboTag tag) { - return has_an_element_in_specific_scope( - parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)}); +static bool has_open_element(const GumboParser* parser, GumboTag tag) { + static const TagSet tags = {TAG(HTML)}; + return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope -static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) { - return has_an_element_in_specific_scope(parser, 1, &tag, false, - (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), - TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), - TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), - TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), - TAG_SVG(TITLE)}); +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope +#define DEFAULT_SCOPE_TAGS \ + TAG(APPLET), \ + TAG(CAPTION), \ + TAG(HTML), \ + TAG(TABLE), \ + TAG(TD), \ + TAG(TH), \ + TAG(MARQUEE), \ + TAG(OBJECT), \ + TAG(TEMPLATE), \ + TAG_MATHML(MI), \ + TAG_MATHML(MO), \ + TAG_MATHML(MN), \ + TAG_MATHML(MS), \ + TAG_MATHML(MTEXT), \ + TAG_MATHML(ANNOTATION_XML), \ + TAG_SVG(FOREIGNOBJECT), \ + TAG_SVG(DESC), \ + TAG_SVG(TITLE) + +static const TagSet heading_tags = { + TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6) +}; + +static const TagSet td_th_tags = { + TAG(TD), TAG(TH) +}; + +static const TagSet dd_dt_tags = { + TAG(DD), TAG(DT) +}; + +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope +static bool has_an_element_in_scope(const GumboParser* parser, GumboTag tag) { + static const TagSet tags = {DEFAULT_SCOPE_TAGS}; + return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); } // Like "has an element in scope", but for the specific case of looking for a -// unique target node, not for any node with a given tag 
name. This duplicates +// unique target node, not for any node with a given tag name. This duplicates // much of the algorithm from has_an_element_in_specific_scope because the // predicate is different when checking for an exact node, and it's easier & // faster just to duplicate the code for this one case than to try and // parameterize it. -static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) { - GumboVector* open_elements = &parser->_parser_state->_open_elements; +static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node) { + static const TagSet tags = {DEFAULT_SCOPE_TAGS}; + const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (int i = open_elements->length; --i >= 0;) { const GumboNode* current = open_elements->data[i]; + const GumboNodeType type = current->type; if (current == node) { return true; - } - if (current->type != GUMBO_NODE_ELEMENT && - current->type != GUMBO_NODE_TEMPLATE) { + } else if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) { continue; - } - if (node_tag_in_set(current, - (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), - TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), - TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), - TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), - TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) { + } else if (node_tag_in_set(current, &tags)) { return false; } } @@ -1429,76 +1576,72 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) { // Like has_an_element_in_scope, but restricts the expected qualified name to a // range of possible qualified names instead of just a single one. 
-static bool has_an_element_in_scope_with_tagname( - GumboParser* parser, int expected_len, const GumboTag expected[]) { - return has_an_element_in_specific_scope(parser, expected_len, expected, false, - (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), - TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), - TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), - TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), - TAG_SVG(TITLE)}); +static bool has_an_element_in_scope_with_tagname ( + const GumboParser* parser, + int len, + const GumboTag expected[] +) { + static const TagSet tags = {DEFAULT_SCOPE_TAGS}; + return has_an_element_in_specific_scope(parser, len, expected, false, &tags); } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope -static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) { - return has_an_element_in_specific_scope(parser, 1, &tag, false, - (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), - TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), - TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), - TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), - TAG_SVG(TITLE), TAG(OL), TAG(UL)}); +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-list-item-scope +static bool has_an_element_in_list_scope(const GumboParser* parser, GumboTag tag) { + static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(OL), TAG(UL)}; + return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope -static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) { - return has_an_element_in_specific_scope(parser, 1, &tag, false, - (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), - 
TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), - TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT), - TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), - TAG_SVG(TITLE), TAG(BUTTON)}); +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-button-scope +static bool has_an_element_in_button_scope(const GumboParser* parser, GumboTag tag) { + static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(BUTTON)}; + return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope -static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) { - return has_an_element_in_specific_scope(parser, 1, &tag, false, - (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}); +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-table-scope +static bool has_an_element_in_table_scope(const GumboParser* parser, GumboTag tag) { + static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}; + return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags); } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope -static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) { - return has_an_element_in_specific_scope( - parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)}); +// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-select-scope +static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag tag) { + static const TagSet tags = {TAG(OPTGROUP), TAG(OPTION)}; + return has_an_element_in_specific_scope(parser, 1, &tag, true, &tags); } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags +// https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags // "exception" is the 
"element to exclude from the process" listed in the spec. // Pass GUMBO_TAG_LAST to not exclude any of them. static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) { - for (; node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), - TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) && - !node_html_tag_is(get_current_node(parser), exception); - pop_current_node(parser)) - ; + static const TagSet tags = { + TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP), + TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC) + }; + while ( + node_tag_in_set(get_current_node(parser), &tags) + && !node_html_tag_is(get_current_node(parser), exception) + ) { + pop_current_node(parser); + } } // This is the "generate all implied end tags thoroughly" clause of the spec. -// https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags +// https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) { - for ( - ; node_tag_in_set(get_current_node(parser), - (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), - TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), - TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)}); - pop_current_node(parser)) - ; + static const TagSet tags = { + TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), + TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), + TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR) + }; + while (node_tag_in_set(get_current_node(parser), &tags)) { + pop_current_node(parser); + } } // This factors out the clauses relating to "act as if an end tag token with tag -// name "table" had been seen. Returns true if there's a table element in table +// name "table" had been seen. 
Returns true if there's a table element in table // scope which was successfully closed, false if not and the token should be -// ignored. Does not add parse errors; callers should handle that. +// ignored. Does not add parse errors; callers should handle that. static bool close_table(GumboParser* parser) { if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) { return false; @@ -1514,8 +1657,11 @@ static bool close_table(GumboParser* parser) { // This factors out the clauses relating to "act as if an end tag token with tag // name `cell_tag` had been seen". -static bool close_table_cell( - GumboParser* parser, const GumboToken* token, GumboTag cell_tag) { +static bool close_table_cell ( + GumboParser* parser, + const GumboToken* token, + GumboTag cell_tag +) { bool result = true; generate_implied_end_tags(parser, GUMBO_TAG_LAST); const GumboNode* node = get_current_node(parser); @@ -1532,7 +1678,7 @@ static bool close_table_cell( return result; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell +// https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell // This holds the logic to determine whether we should close a <td> or a <th>. static bool close_current_cell(GumboParser* parser, const GumboToken* token) { if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { @@ -1545,7 +1691,7 @@ static bool close_current_cell(GumboParser* parser, const GumboToken* token) { } // This factors out the "act as if an end tag of tag name 'select' had been -// seen" clause of the spec, since it's referenced in several places. It pops +// seen" clause of the spec, since it's referenced in several places. It pops // all nodes from the stack until the current <select> has been closed, then // resets the insertion mode appropriately. 
static void close_current_select(GumboParser* parser) { @@ -1557,45 +1703,60 @@ static void close_current_select(GumboParser* parser) { } // The list of nodes in the "special" category: -// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special +// https://html.spec.whatwg.org/multipage/parsing.html#special static bool is_special_node(const GumboNode* node) { assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); - return node_tag_in_set(node, - (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE), - TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE), - TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL), - TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR), - TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET), - TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME), - TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), - TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME), - TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING), - TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), - TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), - TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), - TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY), - TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH), - TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), - - TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), - TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), - - TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)}); + return node_tag_in_set(node, &(const TagSet) { + TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE), + TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE), + TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL), + TAG(COLGROUP), TAG(DD), TAG(DETAILS), TAG(DIR), + TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), 
TAG(FIELDSET), + TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME), + TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), + TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME), + TAG(IMG), TAG(INPUT), TAG(LI), TAG(LINK), TAG(LISTING), + TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED), + TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), + TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), + TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY), + TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH), + TAG(THEAD), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP), + + TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), + TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), + + TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), + + // This TagSet needs to include the "title" element in both the + // HTML and SVG namespaces. Using both TAG(TITLE) and TAG_SVG(TITLE) + // won't work, due to the simplistic way in which the TAG macros are + // implemented, so we do it like this instead: + [GUMBO_TAG_TITLE] = + (1 << GUMBO_NAMESPACE_HTML) | + (1 << GUMBO_NAMESPACE_SVG) + } + ); } // Implicitly closes currently open elements until it reaches an element with // the -// specified qualified name. If the elements closed are in the set handled by +// specified qualified name. If the elements closed are in the set handled by // generate_implied_end_tags, this is normal operation and this function returns -// true. Otherwise, a parse error is recorded and this function returns false. -static bool implicitly_close_tags(GumboParser* parser, GumboToken* token, - GumboNamespaceEnum target_ns, GumboTag target) { +// true. Otherwise, a parse error is recorded and this function returns false. 
+static bool implicitly_close_tags ( + GumboParser* parser, + GumboToken* token, + GumboNamespaceEnum target_ns, + GumboTag target +) { bool result = true; generate_implied_end_tags(parser, target); if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) { parser_add_parse_error(parser, token); while ( - !node_qualified_tag_is(get_current_node(parser), target_ns, target)) { + !node_qualified_tag_is(get_current_node(parser), target_ns, target) + ) { pop_current_node(parser); } result = false; @@ -1606,44 +1767,61 @@ static bool implicitly_close_tags(GumboParser* parser, GumboToken* token, } // If the stack of open elements has a <p> tag in button scope, this acts as if -// a </p> tag was encountered, implicitly closing tags. Returns false if a -// parse error occurs. This is a convenience function because this particular +// a </p> tag was encountered, implicitly closing tags. Returns false if a +// parse error occurs. This is a convenience function because this particular // clause appears several times in the spec. -static bool maybe_implicitly_close_p_tag( - GumboParser* parser, GumboToken* token) { +static bool maybe_implicitly_close_p_tag ( + GumboParser* parser, + GumboToken* token +) { if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { - return implicitly_close_tags( - parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P); + return implicitly_close_tags ( + parser, + token, + GUMBO_NAMESPACE_HTML, + GUMBO_TAG_P + ); } return true; } // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt> -// tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>. -static void maybe_implicitly_close_list_tag( - GumboParser* parser, GumboToken* token, bool is_li) { +// tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>. 
+static void maybe_implicitly_close_list_tag ( + GumboParser* parser, + GumboToken* token, + bool is_li +) { GumboParserState* state = parser->_parser_state; state->_frameset_ok = false; for (int i = state->_open_elements.length; --i >= 0;) { const GumboNode* node = state->_open_elements.data[i]; - bool is_list_tag = - is_li ? node_html_tag_is(node, GUMBO_TAG_LI) - : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)}); + bool is_list_tag = is_li + ? node_html_tag_is(node, GUMBO_TAG_LI) + : node_tag_in_set(node, &dd_dt_tags) + ; if (is_list_tag) { - implicitly_close_tags( - parser, token, node->v.element.tag_namespace, node->v.element.tag); + implicitly_close_tags ( + parser, + token, + node->v.element.tag_namespace, + node->v.element.tag + ); return; } - if (is_special_node(node) && - !node_tag_in_set( - node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) { + if ( + is_special_node(node) + && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)}) + ) { return; } } } -static void merge_attributes( - GumboParser* parser, GumboToken* token, GumboNode* node) { +static void merge_attributes ( + GumboToken* token, + GumboNode* node +) { assert(token->type == GUMBO_TOKEN_START_TAG); assert(node->type == GUMBO_NODE_ELEMENT); const GumboVector* token_attr = &token->v.start_tag.attributes; @@ -1655,15 +1833,15 @@ static void merge_attributes( // Ownership of the attribute is transferred by this gumbo_vector_add, // so it has to be nulled out of the original token so it doesn't get // double-deleted. - gumbo_vector_add(parser, attr, node_attr); + gumbo_vector_add(attr, node_attr); token_attr->data[i] = NULL; } } // When attributes are merged, it means the token has been ignored and merged - // with another token, so we need to free its memory. The attributes that are + // with another token, so we need to free its memory. The attributes that are // transferred need to be nulled-out in the vector above so that they aren't // double-deleted. 
- gumbo_token_destroy(parser, token); + gumbo_token_destroy(token); #ifndef NDEBUG // Mark this sentinel so the assertion in the main loop knows it's been @@ -1673,80 +1851,107 @@ static void merge_attributes( } const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) { - for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry); - ++i) { - const ReplacementEntry* entry = &kSvgTagReplacements[i]; - if (gumbo_string_equals_ignore_case(tag, &entry->from)) { - return entry->to.data; - } - } - return NULL; + const StringReplacement *replacement = gumbo_get_svg_tag_replacement ( + tag->data, + tag->length + ); + return replacement ? replacement->to : NULL; } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes +// https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes // This destructively modifies any matching attributes on the token and sets the // namespace appropriately. -static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) { +static void adjust_foreign_attributes(GumboToken* token) { assert(token->type == GUMBO_TOKEN_START_TAG); const GumboVector* attributes = &token->v.start_tag.attributes; - for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) / - sizeof(NamespacedAttributeReplacement); - ++i) { - const NamespacedAttributeReplacement* entry = - &kForeignAttributeReplacements[i]; - GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from); - if (!attr) { + for (unsigned int i = 0, n = attributes->length; i < n; ++i) { + GumboAttribute* attr = attributes->data[i]; + const ForeignAttrReplacement* entry = gumbo_get_foreign_attr_replacement ( + attr->name, + strlen(attr->name) + ); + if (!entry) { continue; } - gumbo_parser_deallocate(parser, (void*) attr->name); + gumbo_free((void*) attr->name); attr->attr_namespace = entry->attr_namespace; - attr->name = gumbo_copy_stringz(parser, entry->local_name); + 
attr->name = gumbo_strdup(entry->local_name); + } +} + +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign +// This adjusts svg tags. +static void adjust_svg_tag(GumboToken* token) { + assert(token->type == GUMBO_TOKEN_START_TAG); + if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) { + assert(token->v.start_tag.name == NULL); + token->v.start_tag.name = "foreignObject"; + } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) { + assert(token->v.start_tag.name); + const StringReplacement *replacement = gumbo_get_svg_tag_replacement( + token->v.start_tag.name, + strlen(token->v.start_tag.name) + ); + if (replacement) { + // This cast is safe because we allocated this memory and we'll free it. + strcpy((char *)token->v.start_tag.name, replacement->to); + } } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes +// https://html.spec.whatwg.org/multipage/parsing.html#adjust-svg-attributes // This destructively modifies any matching attributes on the token. 
-static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) { +static void adjust_svg_attributes(GumboToken* token) { assert(token->type == GUMBO_TOKEN_START_TAG); const GumboVector* attributes = &token->v.start_tag.attributes; - for (size_t i = 0; - i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) { - const ReplacementEntry* entry = &kSvgAttributeReplacements[i]; - GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data); - if (!attr) { + for (unsigned int i = 0, n = attributes->length; i < n; i++) { + GumboAttribute* attr = (GumboAttribute*) attributes->data[i]; + const StringReplacement* replacement = gumbo_get_svg_attr_replacement ( + attr->name, + attr->original_name.length + ); + if (!replacement) { continue; } - gumbo_parser_deallocate(parser, (void*) attr->name); - attr->name = gumbo_copy_stringz(parser, entry->to.data); + gumbo_free((void*) attr->name); + attr->name = gumbo_strdup(replacement->to); } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes +// https://html.spec.whatwg.org/multipage/parsing.html#adjust-mathml-attributes // Note that this may destructively modify the token with the new attribute // value. 
-static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) { +static void adjust_mathml_attributes(GumboToken* token) { assert(token->type == GUMBO_TOKEN_START_TAG); - GumboAttribute* attr = - gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl"); + GumboAttribute* attr = gumbo_get_attribute ( + &token->v.start_tag.attributes, + "definitionurl" + ); if (!attr) { return; } - gumbo_parser_deallocate(parser, (void*) attr->name); - attr->name = gumbo_copy_stringz(parser, "definitionURL"); + gumbo_free((void*) attr->name); + attr->name = gumbo_strdup("definitionURL"); } -static bool doctype_matches(const GumboTokenDocType* doctype, - const GumboStringPiece* public_id, const GumboStringPiece* system_id, - bool allow_missing_system_id) { - return !strcmp(doctype->public_identifier, public_id->data) && - (allow_missing_system_id || doctype->has_system_identifier) && - !strcmp(doctype->system_identifier, system_id->data); +static bool doctype_matches ( + const GumboTokenDocType* doctype, + const GumboStringPiece* public_id, + const GumboStringPiece* system_id, + bool allow_missing_system_id +) { + return + !strcmp(doctype->public_identifier, public_id->data) + && (allow_missing_system_id || doctype->has_system_identifier) + && !strcmp(doctype->system_identifier, system_id->data); } -static bool maybe_add_doctype_error( - GumboParser* parser, const GumboToken* token) { +static bool maybe_add_doctype_error ( + GumboParser* parser, + const GumboToken* token +) { const GumboTokenDocType* doctype = &token->v.doc_type; - bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data); + bool html_doctype = !strcmp(doctype->name, "html"); if ((!html_doctype || doctype->has_public_identifier || (doctype->has_system_identifier && !strcmp( @@ -1765,10 +1970,10 @@ static bool maybe_add_doctype_error( return true; } -static void remove_from_parent(GumboParser* parser, GumboNode* node) { +static void remove_from_parent(GumboNode* node) { if 
(!node->parent) { // The node may not have a parent if, for example, it is a newly-cloned copy - // of an active formatting element. DOM manipulations continue with the + // of an active formatting element. DOM manipulations continue with the // orphaned fragment of the DOM tree until it's appended/foster-parented to // the common ancestor at the end of the adoption agency algorithm. return; @@ -1778,7 +1983,7 @@ static void remove_from_parent(GumboParser* parser, GumboNode* node) { int index = gumbo_vector_index_of(children, node); assert(index != -1); - gumbo_vector_remove_at(parser, index, children); + gumbo_vector_remove_at(index, children); node->parent = NULL; node->index_within_parent = -1; for (unsigned int i = index; i < children->length; ++i) { @@ -1787,18 +1992,25 @@ static void remove_from_parent(GumboParser* parser, GumboNode* node) { } } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser +// https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser // Also described in the "in body" handling for end formatting tags. -static bool adoption_agency_algorithm( - GumboParser* parser, GumboToken* token, GumboTag subject) { +static bool adoption_agency_algorithm ( + GumboParser* parser, + GumboToken* token, + GumboTag subject +) { GumboParserState* state = parser->_parser_state; gumbo_debug("Entering adoption agency algorithm.\n"); // Step 1. 
GumboNode* current_node = get_current_node(parser); - if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML && - current_node->v.element.tag == subject && - gumbo_vector_index_of( - &state->_active_formatting_elements, current_node) == -1) { + if ( + current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML + && current_node->v.element.tag == subject + && -1 == gumbo_vector_index_of ( + &state->_active_formatting_elements, + current_node + ) + ) { pop_current_node(parser); return false; } @@ -1817,11 +2029,15 @@ static bool adoption_agency_algorithm( if (node_html_tag_is(current_node, subject)) { // Found it. formatting_node = current_node; - formatting_node_in_open_elements = - gumbo_vector_index_of(&state->_open_elements, formatting_node); - gumbo_debug("Formatting element of tag %s at %d.\n", - gumbo_normalized_tagname(subject), - formatting_node_in_open_elements); + formatting_node_in_open_elements = gumbo_vector_index_of ( + &state->_open_elements, + formatting_node + ); + gumbo_debug ( + "Formatting element of tag %s at %d.\n", + gumbo_normalized_tagname(subject), + formatting_node_in_open_elements + ); break; } } @@ -1837,8 +2053,10 @@ static bool adoption_agency_algorithm( if (formatting_node_in_open_elements == -1) { gumbo_debug("Formatting node not on stack of open elements.\n"); parser_add_parse_error(parser, token); - gumbo_vector_remove( - parser, formatting_node, &state->_active_formatting_elements); + gumbo_vector_remove ( + formatting_node, + &state->_active_formatting_elements + ); return false; } @@ -1859,8 +2077,11 @@ static bool adoption_agency_algorithm( // Step 9 & 10 GumboNode* furthest_block = NULL; - for (unsigned int j = formatting_node_in_open_elements; - j < state->_open_elements.length; ++j) { + for ( + unsigned int j = formatting_node_in_open_elements; + j < state->_open_elements.length; + ++j + ) { assert(j > 0); GumboNode* current = state->_open_elements.data[j]; if (is_special_node(current)) { @@ -1876,8 +2097,10 @@ 
static bool adoption_agency_algorithm( } // And the formatting element itself. pop_current_node(parser); - gumbo_vector_remove( - parser, formatting_node, &state->_active_formatting_elements); + gumbo_vector_remove ( + formatting_node, + &state->_active_formatting_elements + ); return false; } assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML)); @@ -1886,18 +2109,20 @@ static bool adoption_agency_algorithm( // Step 11. // Elements may be moved and reparented by this algorithm, so // common_ancestor is not necessarily the same as formatting_node->parent. - GumboNode* common_ancestor = - state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements, - formatting_node) - - 1]; - gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n", - gumbo_normalized_tagname(common_ancestor->v.element.tag), - gumbo_normalized_tagname(furthest_block->v.element.tag)); + GumboNode* common_ancestor = state->_open_elements.data [ + gumbo_vector_index_of(&state->_open_elements, formatting_node) - 1 + ]; + gumbo_debug ( + "Common ancestor tag = %s, furthest block tag = %s.\n", + gumbo_normalized_tagname(common_ancestor->v.element.tag), + gumbo_normalized_tagname(furthest_block->v.element.tag) + ); // Step 12. - int bookmark = gumbo_vector_index_of( - &state->_active_formatting_elements, formatting_node) + - 1; + int bookmark = 1 + gumbo_vector_index_of ( + &state->_active_formatting_elements, + formatting_node + ); gumbo_debug("Bookmark at %d.\n", bookmark); // Step 13. GumboNode* node = furthest_block; @@ -1912,8 +2137,11 @@ static bool adoption_agency_algorithm( ++j; // Step 13.3. 
int node_index = gumbo_vector_index_of(&state->_open_elements, node); - gumbo_debug( - "Current index: %d, last index: %d.\n", node_index, saved_node_index); + gumbo_debug ( + "Current index: %d, last index: %d.\n", + node_index, + saved_node_index + ); if (node_index == -1) { node_index = saved_node_index; } @@ -1926,13 +2154,17 @@ static bool adoption_agency_algorithm( // Step 13.4. break; } - int formatting_index = - gumbo_vector_index_of(&state->_active_formatting_elements, node); + int formatting_index = gumbo_vector_index_of ( + &state->_active_formatting_elements, + node + ); if (j > 3 && formatting_index != -1) { // Step 13.5. gumbo_debug("Removing formatting element at %d.\n", formatting_index); - gumbo_vector_remove_at( - parser, formatting_index, &state->_active_formatting_elements); + gumbo_vector_remove_at ( + formatting_index, + &state->_active_formatting_elements + ); // Removing the element shifts all indices over by one, so we may need // to move the bookmark. if (formatting_index < bookmark) { @@ -1943,13 +2175,13 @@ static bool adoption_agency_algorithm( } if (formatting_index == -1) { // Step 13.6. - gumbo_vector_remove_at(parser, node_index, &state->_open_elements); + gumbo_vector_remove_at(node_index, &state->_open_elements); continue; } // Step 13.7. // "common ancestor as the intended parent" doesn't actually mean insert // it into the common ancestor; that happens below. - node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); + node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); assert(formatting_index >= 0); state->_active_formatting_elements.data[formatting_index] = node; assert(node_index >= 0); @@ -1962,35 +2194,42 @@ static bool adoption_agency_algorithm( } // Step 13.9. last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; - remove_from_parent(parser, last_node); - append_node(parser, node, last_node); + remove_from_parent(last_node); + append_node(node, last_node); // Step 13.10. 
last_node = node; } // Step 13.11. // Step 14. - gumbo_debug("Removing %s node from parent ", - gumbo_normalized_tagname(last_node->v.element.tag)); - remove_from_parent(parser, last_node); + gumbo_debug ( + "Removing %s node from parent ", + gumbo_normalized_tagname(last_node->v.element.tag) + ); + remove_from_parent(last_node); last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED; - InsertionLocation location = - get_appropriate_insertion_location(parser, common_ancestor); - gumbo_debug("and inserting it into %s.\n", - gumbo_normalized_tagname(location.target->v.element.tag)); - insert_node(parser, last_node, location); + InsertionLocation location = get_appropriate_insertion_location ( + parser, + common_ancestor + ); + gumbo_debug ( + "and inserting it into %s.\n", + gumbo_normalized_tagname(location.target->v.element.tag) + ); + insert_node(last_node, location); // Step 15. - GumboNode* new_formatting_node = clone_node( - parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED); + GumboNode* new_formatting_node = clone_node ( + formatting_node, + GUMBO_INSERTION_ADOPTION_AGENCY_CLONED + ); formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; - // Step 16. Instead of appending nodes one-by-one, we swap the children + // Step 16. Instead of appending nodes one-by-one, we swap the children // vector of furthest_block with the empty children of new_formatting_node, - // reducing memory traffic and allocations. We still have to reset their + // reducing memory traffic and allocations. We still have to reset their // parent pointers, though. GumboVector temp = new_formatting_node->v.element.children; - new_formatting_node->v.element.children = - furthest_block->v.element.children; + new_formatting_node->v.element.children = furthest_block->v.element.children; furthest_block->v.element.children = temp; temp = new_formatting_node->v.element.children; @@ -2000,36 +2239,49 @@ static bool adoption_agency_algorithm( } // Step 17. 
- append_node(parser, furthest_block, new_formatting_node); + append_node(furthest_block, new_formatting_node); // Step 18. // If the formatting node was before the bookmark, it may shift over all // indices after it, so we need to explicitly find the index and possibly // adjust the bookmark. - int formatting_node_index = gumbo_vector_index_of( - &state->_active_formatting_elements, formatting_node); + int formatting_node_index = gumbo_vector_index_of ( + &state->_active_formatting_elements, + formatting_node + ); assert(formatting_node_index != -1); if (formatting_node_index < bookmark) { - gumbo_debug( - "Formatting node at %d is before bookmark at %d; decrementing.\n", - formatting_node_index, bookmark); + gumbo_debug ( + "Formatting node at %d is before bookmark at %d; decrementing.\n", + formatting_node_index, bookmark + ); --bookmark; } - gumbo_vector_remove_at( - parser, formatting_node_index, &state->_active_formatting_elements); + gumbo_vector_remove_at ( + formatting_node_index, + &state->_active_formatting_elements + ); assert(bookmark >= 0); assert((unsigned int) bookmark <= state->_active_formatting_elements.length); - gumbo_vector_insert_at(parser, new_formatting_node, bookmark, - &state->_active_formatting_elements); + gumbo_vector_insert_at ( + new_formatting_node, + bookmark, + &state->_active_formatting_elements + ); // Step 19. - gumbo_vector_remove(parser, formatting_node, &state->_open_elements); - int insert_at = - gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1; + gumbo_vector_remove(formatting_node, &state->_open_elements); + int insert_at = 1 + gumbo_vector_index_of ( + &state->_open_elements, + furthest_block + ); assert(insert_at >= 0); assert((unsigned int) insert_at <= state->_open_elements.length); - gumbo_vector_insert_at( - parser, new_formatting_node, insert_at, &state->_open_elements); + gumbo_vector_insert_at ( + new_formatting_node, + insert_at, + &state->_open_elements + ); } // Step 20. 
return true; } @@ -2041,25 +2293,31 @@ static void ignore_token(GumboParser* parser) { // element, but if no element is emitted (as happens in non-verbatim-mode // when a token is ignored), we need to free it here to prevent a memory // leak. - gumbo_token_destroy(parser, token); + gumbo_token_destroy(token); #ifndef NDEBUG if (token->type == GUMBO_TOKEN_START_TAG) { // Mark this sentinel so the assertion in the main loop knows it's been // destroyed. token->v.start_tag.attributes = kGumboEmptyVector; + token->v.start_tag.name = NULL; } #endif } -// http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html +// https://html.spec.whatwg.org/multipage/parsing.html#the-end static void finish_parsing(GumboParser* parser) { gumbo_debug("Finishing parsing"); maybe_flush_text_node_buffer(parser); GumboParserState* state = parser->_parser_state; - for (GumboNode* node = pop_current_node(parser); node; - node = pop_current_node(parser)) { - if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) || - (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) { + for ( + GumboNode* node = pop_current_node(parser); + node; + node = pop_current_node(parser) + ) { + if ( + (node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) + || (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag) + ) { continue; } node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; @@ -2092,7 +2350,7 @@ static bool handle_initial(GumboParser* parser, GumboToken* token) { return true; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode +// https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode static bool handle_before_html(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_DOCTYPE) { parser_add_parse_error(parser, token); @@ -2109,15 +2367,19 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) { 
parser->_output->root = html_node; set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); return true; - } else if (token->type == GUMBO_TOKEN_END_TAG && - !tag_in(token, false, - (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) { + } else if ( + token->type == GUMBO_TOKEN_END_TAG + && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)}) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; } else { - GumboNode* html_node = insert_element_of_tag_type( - parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); + GumboNode* html_node = insert_element_of_tag_type ( + parser, + GUMBO_TAG_HTML, + GUMBO_INSERTION_IMPLIED + ); assert(html_node); parser->_output->root = html_node; set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD); @@ -2126,7 +2388,7 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode +// https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode static bool handle_before_head(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_DOCTYPE) { parser_add_parse_error(parser, token); @@ -2143,15 +2405,19 @@ static bool handle_before_head(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); parser->_parser_state->_head_element = node; return true; - } else if (token->type == GUMBO_TOKEN_END_TAG && - !tag_in(token, false, - (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) { + } else if ( + token->type == GUMBO_TOKEN_END_TAG + && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)}) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; } else { - GumboNode* node = insert_element_of_tag_type( - parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED); + GumboNode* node = insert_element_of_tag_type ( + parser, + 
GUMBO_TAG_HEAD, + GUMBO_INSERTION_IMPLIED + ); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); parser->_parser_state->_head_element = node; parser->_parser_state->_reprocess_current_token = true; @@ -2162,8 +2428,9 @@ static bool handle_before_head(GumboParser* parser, GumboToken* token) { // Forward declarations because of mutual dependencies. static bool handle_token(GumboParser* parser, GumboToken* token); static bool handle_in_body(GumboParser* parser, GumboToken* token); +static bool handle_in_template(GumboParser* parser, GumboToken* token); -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead static bool handle_in_head(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_WHITESPACE) { insert_text_token(parser, token); @@ -2177,9 +2444,11 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) { return handle_in_body(parser, token); - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), - TAG(MENUITEM), TAG(LINK)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK) + }) + ) { insert_element_from_token(parser, token); pop_current_node(parser); acknowledge_self_closing_tag(parser); @@ -2189,15 +2458,16 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { pop_current_node(parser); acknowledge_self_closing_tag(parser); // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the - // spec doesn't apply. If clients want to handle meta-tag re-encoding, they + // spec doesn't apply. If clients want to handle meta-tag re-encoding, they // should specifically look for that string in the document and re-encode it // before passing to Gumbo. 
return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); return true; - } else if (tag_in( - token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)}) + ) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT); return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) { @@ -2209,12 +2479,13 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { return true; } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) { GumboNode* head = pop_current_node(parser); - AVOID_UNUSED_VARIABLE_WARNING(head); + UNUSED_IF_NDEBUG(head); assert(node_html_tag_is(head, GUMBO_TAG_HEAD)); set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); return true; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)}) + ) { pop_current_node(parser); set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD); parser->_parser_state->_reprocess_current_token = true; @@ -2244,8 +2515,10 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { pop_template_insertion_mode(parser); reset_insertion_mode_appropriately(parser); return success; - } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || - (token->type == GUMBO_TOKEN_END_TAG)) { + } else if ( + tag_is(token, kStartTag, GUMBO_TAG_HEAD) + || (token->type == GUMBO_TOKEN_END_TAG) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -2258,7 +2531,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) { return true; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript static bool handle_in_head_noscript(GumboParser* parser, 
GumboToken* token) { if (token->type == GUMBO_TOKEN_DOCTYPE) { parser_add_parse_error(parser, token); @@ -2268,19 +2541,25 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) { } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) { const GumboNode* node = pop_current_node(parser); assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); - AVOID_UNUSED_VARIABLE_WARNING(node); + UNUSED_IF_NDEBUG(node); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); return true; - } else if (token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_COMMENT || - tag_in(token, kStartTag, - (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), - TAG(META), TAG(NOFRAMES), TAG(STYLE)})) { + } else if ( + token->type == GUMBO_TOKEN_WHITESPACE + || token->type == GUMBO_TOKEN_COMMENT + || tag_in (token, kStartTag, &(const TagSet) { + TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), + TAG(META), TAG(NOFRAMES), TAG(STYLE) + }) + ) { return handle_in_head(parser, token); - } else if (tag_in( - token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) || - (token->type == GUMBO_TOKEN_END_TAG && - !tag_is(token, kEndTag, GUMBO_TAG_BR))) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)}) + || ( + token->type == GUMBO_TOKEN_END_TAG + && !tag_is(token, kEndTag, GUMBO_TAG_BR) + ) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -2288,14 +2567,14 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) { parser_add_parse_error(parser, token); const GumboNode* node = pop_current_node(parser); assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT)); - AVOID_UNUSED_VARIABLE_WARNING(node); + UNUSED_IF_NDEBUG(node); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD); parser->_parser_state->_reprocess_current_token = true; return false; } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode +// 
https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode static bool handle_after_head(GumboParser* parser, GumboToken* token) { GumboParserState* state = parser->_parser_state; if (token->type == GUMBO_TOKEN_WHITESPACE) { @@ -2319,25 +2598,30 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) { insert_element_from_token(parser, token); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET); return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), - TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), - TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META), + TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) + }) + ) { parser_add_parse_error(parser, token); assert(state->_head_element != NULL); // This must be flushed before we push the head element on, as there may be // pending character tokens that should be attached to the root. 
maybe_flush_text_node_buffer(parser); - gumbo_vector_add(parser, state->_head_element, &state->_open_elements); + gumbo_vector_add(state->_head_element, &state->_open_elements); bool result = handle_in_head(parser, token); - gumbo_vector_remove(parser, state->_head_element, &state->_open_elements); + gumbo_vector_remove(state->_head_element, &state->_open_elements); return result; } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { return handle_in_head(parser, token); - } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || - (token->type == GUMBO_TOKEN_END_TAG && - !tag_in(token, kEndTag, - (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) { + } else if ( + tag_is(token, kStartTag, GUMBO_TAG_HEAD) + || ( + token->type == GUMBO_TOKEN_END_TAG + && !tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)}) + ) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -2349,40 +2633,7 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) { } } -static void destroy_node(GumboParser* parser, GumboNode* node) { - switch (node->type) { - case GUMBO_NODE_DOCUMENT: { - GumboDocument* doc = &node->v.document; - for (unsigned int i = 0; i < doc->children.length; ++i) { - destroy_node(parser, doc->children.data[i]); - } - gumbo_parser_deallocate(parser, (void*) doc->children.data); - gumbo_parser_deallocate(parser, (void*) doc->name); - gumbo_parser_deallocate(parser, (void*) doc->public_identifier); - gumbo_parser_deallocate(parser, (void*) doc->system_identifier); - } break; - case GUMBO_NODE_TEMPLATE: - case GUMBO_NODE_ELEMENT: - for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) { - gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]); - } - gumbo_parser_deallocate(parser, node->v.element.attributes.data); - for (unsigned int i = 0; i < node->v.element.children.length; ++i) { - destroy_node(parser, node->v.element.children.data[i]); - } - gumbo_parser_deallocate(parser, 
node->v.element.children.data); - break; - case GUMBO_NODE_TEXT: - case GUMBO_NODE_CDATA: - case GUMBO_NODE_COMMENT: - case GUMBO_NODE_WHITESPACE: - gumbo_parser_deallocate(parser, (void*) node->v.text.text); - break; - } - gumbo_parser_deallocate(parser, node); -} - -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody static bool handle_in_body(GumboParser* parser, GumboToken* token) { GumboParserState* state = parser->_parser_state; assert(state->_open_elements.length > 0); @@ -2394,8 +2645,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); return true; - } else if (token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_CDATA) { + } else if ( + token->type == GUMBO_TOKEN_CHARACTER + || token->type == GUMBO_TOKEN_CDATA + ) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); set_frameset_not_ok(parser); @@ -2415,30 +2668,37 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } assert(parser->_output->root != NULL); assert(parser->_output->root->type == GUMBO_NODE_ELEMENT); - merge_attributes(parser, token, parser->_output->root); + merge_attributes(token, parser->_output->root); return false; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), - TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES), - TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || - tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), + TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), + TAG(TITLE) + }) + || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) + ) { return handle_in_head(parser, token); } else if (tag_is(token, kStartTag, 
GUMBO_TAG_BODY)) { parser_add_parse_error(parser, token); - if (state->_open_elements.length < 2 || - !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || - has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + if ( + state->_open_elements.length < 2 + || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) + || has_open_element(parser, GUMBO_TAG_TEMPLATE) + ) { ignore_token(parser); return false; } state->_frameset_ok = false; - merge_attributes(parser, token, state->_open_elements.data[1]); + merge_attributes(token, state->_open_elements.data[1]); return false; } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) { parser_add_parse_error(parser, token); - if (state->_open_elements.length < 2 || - !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || - !state->_frameset_ok) { + if ( + state->_open_elements.length < 2 + || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) + || !state->_frameset_ok + ) { ignore_token(parser); return false; } @@ -2454,20 +2714,20 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { // Removing & destroying the body node is going to kill any nodes that have // been added to the list of active formatting elements, and so we should // clear it to prevent a use-after-free if the list of active formatting - // elements is reconstructed afterwards. This may happen if whitespace + // elements is reconstructed afterwards. This may happen if whitespace // follows the </frameset>. clear_active_formatting_elements(parser); - // Remove the body node. We may want to factor this out into a generic + // Remove the body node. We may want to factor this out into a generic // helper, but right now this is the only code that needs to do this. 
GumboVector* children = &parser->_output->root->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { if (children->data[i] == body_node) { - gumbo_vector_remove_at(parser, i, children); + gumbo_vector_remove_at(i, children); break; } } - destroy_node(parser, body_node); + destroy_node(body_node); // Insert the <frameset>, and switch the insertion mode. insert_element_from_token(parser, token); @@ -2475,10 +2735,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { return true; } else if (token->type == GUMBO_TOKEN_EOF) { for (unsigned int i = 0; i < state->_open_elements.length; ++i) { - if (!node_tag_in_set(state->_open_elements.data[i], - (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), - TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), - TAG(HTML)})) { + if ( + !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) { + TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT), + TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML) + }) + ) { parser_add_parse_error(parser, token); } } @@ -2487,7 +2749,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { return handle_in_template(parser, token); } return true; - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) { + } else if (tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML)})) { if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) { parser_add_parse_error(parser, token); ignore_token(parser); @@ -2495,11 +2757,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } bool success = true; for (unsigned int i = 0; i < state->_open_elements.length; ++i) { - if (!node_tag_in_set(state->_open_elements.data[i], - (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), - TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), - TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), - TAG(BODY), TAG(HTML)})) { + if ( + 
!node_tag_in_set(state->_open_elements.data[i], &(const TagSet) { + TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), + TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), + TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML) + }) + ) { parser_add_parse_error(parser, token); success = false; break; @@ -2514,37 +2778,38 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { record_end_of_element(state->_current_token, &body->v.element); } return success; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), - TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR), - TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION), - TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), - TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), - TAG(SECTION), TAG(SUMMARY), TAG(UL)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER), + TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), + TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), + TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION), + TAG(SUMMARY), TAG(UL) + }) + ) { bool result = maybe_implicitly_close_p_tag(parser, token); insert_element_from_token(parser, token); return result; - } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})) { + } else if (tag_in(token, kStartTag, &heading_tags)) { bool result = maybe_implicitly_close_p_tag(parser, token); - if (node_tag_in_set( - get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})) { + if (node_tag_in_set(get_current_node(parser), &heading_tags)) { parser_add_parse_error(parser, token); pop_current_node(parser); result = false; } insert_element_from_token(parser, token); return result; - } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) { + } else 
if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) { bool result = maybe_implicitly_close_p_tag(parser, token); insert_element_from_token(parser, token); state->_ignore_next_linefeed = true; state->_frameset_ok = false; return result; } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) { - if (state->_form_element != NULL && - !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { + if ( + state->_form_element != NULL + && !has_open_element(parser, GUMBO_TAG_TEMPLATE) + ) { gumbo_debug("Ignoring nested form.\n"); parser_add_parse_error(parser, token); ignore_token(parser); @@ -2561,7 +2826,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { bool result = maybe_implicitly_close_p_tag(parser, token); insert_element_from_token(parser, token); return result; - } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) { + } else if (tag_in(token, kStartTag, &dd_dt_tags)) { maybe_implicitly_close_list_tag(parser, token, false); bool result = maybe_implicitly_close_p_tag(parser, token); insert_element_from_token(parser, token); @@ -2574,8 +2839,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) { if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) { parser_add_parse_error(parser, token); - implicitly_close_tags( - parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON); + implicitly_close_tags ( + parser, + token, + GUMBO_NAMESPACE_HTML, + GUMBO_TAG_BUTTON + ); state->_reprocess_current_token = true; return false; } @@ -2583,21 +2852,27 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { insert_element_from_token(parser, token); state->_frameset_ok = false; return true; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), - TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS), - TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), - TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), - 
TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), - TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) { - GumboTag tag = token->v.end_tag; + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON), + TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), + TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), + TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), + TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL) + }) + ) { + GumboTag tag = token->v.end_tag.tag; if (!has_an_element_in_scope(parser, tag)) { parser_add_parse_error(parser, token); ignore_token(parser); return false; } - implicitly_close_tags( - parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag); + implicitly_close_tags ( + parser, + token, + GUMBO_NAMESPACE_HTML, + token->v.end_tag.tag + ); return true; } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) { if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) { @@ -2617,7 +2892,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { return success; } else { bool result = true; - const GumboNode* node = state->_form_element; + GumboNode* node = state->_form_element; assert(!node || node->type == GUMBO_NODE_ELEMENT); state->_form_element = NULL; if (!node || !has_node_in_scope(parser, node)) { @@ -2632,48 +2907,67 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { if (get_current_node(parser) != node) { parser_add_parse_error(parser, token); result = false; + } else { + record_end_of_element(token, &node->v.element); } GumboVector* open_elements = &state->_open_elements; int index = gumbo_vector_index_of(open_elements, node); assert(index >= 0); - gumbo_vector_remove_at(parser, index, open_elements); + gumbo_vector_remove_at(index, open_elements); return result; } } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) { if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) { 
parser_add_parse_error(parser, token); // reconstruct_active_formatting_elements(parser); - insert_element_of_tag_type( - parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG); + insert_element_of_tag_type ( + parser, + GUMBO_TAG_P, + GUMBO_INSERTION_CONVERTED_FROM_END_TAG + ); state->_reprocess_current_token = true; return false; } - return implicitly_close_tags( - parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P); + return implicitly_close_tags ( + parser, + token, + GUMBO_NAMESPACE_HTML, + GUMBO_TAG_P + ); } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) { if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) { parser_add_parse_error(parser, token); ignore_token(parser); return false; } - return implicitly_close_tags( - parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI); - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) { + return implicitly_close_tags ( + parser, + token, + GUMBO_NAMESPACE_HTML, + GUMBO_TAG_LI + ); + } else if (tag_in(token, kEndTag, &dd_dt_tags)) { assert(token->type == GUMBO_TOKEN_END_TAG); - GumboTag token_tag = token->v.end_tag; + GumboTag token_tag = token->v.end_tag.tag; if (!has_an_element_in_scope(parser, token_tag)) { parser_add_parse_error(parser, token); ignore_token(parser); return false; } - return implicitly_close_tags( - parser, token, GUMBO_NAMESPACE_HTML, token_tag); - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})) { - if (!has_an_element_in_scope_with_tagname( - parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, - GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) { + return implicitly_close_tags ( + parser, + token, + GUMBO_NAMESPACE_HTML, + token_tag + ); + } else if (tag_in(token, kEndTag, &heading_tags)) { + if ( + !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) { + GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4, + GUMBO_TAG_H5, GUMBO_TAG_H6 + }) + ) { // No heading open; ignore the token entirely. 
parser_add_parse_error(parser, token); ignore_token(parser); @@ -2681,7 +2975,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } else { generate_implied_end_tags(parser, GUMBO_TAG_LAST); const GumboNode* current_node = get_current_node(parser); - bool success = node_html_tag_is(current_node, token->v.end_tag); + bool success = node_html_tag_is(current_node, token->v.end_tag.tag); if (!success) { // There're children of the heading currently open; close them below and // record a parse error. @@ -2691,9 +2985,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } do { current_node = pop_current_node(parser); - } while (!node_tag_in_set( - current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3), - TAG(H4), TAG(H5), TAG(H6)})); + } while (!node_tag_in_set(current_node, &heading_tags)); return success; } } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) { @@ -2706,22 +2998,26 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { adoption_agency_algorithm(parser, token, GUMBO_TAG_A); // The adoption agency algorithm usually removes all instances of <a> // from the list of active formatting elements, but in case it doesn't, - // we're supposed to do this. (The conditions where it might not are + // we're supposed to do this. (The conditions where it might not are // listed in the spec.) 
if (find_last_anchor_index(parser, &last_a)) { - void* last_element = gumbo_vector_remove_at( - parser, last_a, &state->_active_formatting_elements); - gumbo_vector_remove(parser, last_element, &state->_open_elements); + void* last_element = gumbo_vector_remove_at ( + last_a, + &state->_active_formatting_elements + ); + gumbo_vector_remove(last_element, &state->_open_elements); } success = false; } reconstruct_active_formatting_elements(parser); add_formatting_element(parser, insert_element_from_token(parser, token)); return success; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), - TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), - TAG(TT), TAG(U)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S), + TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U) + }) + ) { reconstruct_active_formatting_elements(parser); add_formatting_element(parser, insert_element_from_token(parser, token)); return true; @@ -2737,21 +3033,26 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { insert_element_from_token(parser, token); add_formatting_element(parser, get_current_node(parser)); return result; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), - TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL), - TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) { - return adoption_agency_algorithm(parser, token, token->v.end_tag); - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), + TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), + TAG(U) + }) + ) { + return adoption_agency_algorithm(parser, token, token->v.end_tag.tag); + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), 
TAG(OBJECT)}) + ) { reconstruct_active_formatting_elements(parser); insert_element_from_token(parser, token); add_formatting_element(parser, &kActiveFormattingScopeMarker); set_frameset_not_ok(parser); return true; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) { - GumboTag token_tag = token->v.end_tag; + } else if ( + tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)}) + ) { + GumboTag token_tag = token->v.end_tag.tag; if (!has_an_element_in_table_scope(parser, token_tag)) { parser_add_parse_error(parser, token); ignore_token(parser); @@ -2761,17 +3062,22 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { clear_active_formatting_elements(parser); return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) { - if (get_document_node(parser)->v.document.doc_type_quirks_mode != - GUMBO_DOCTYPE_QUIRKS) { + if ( + get_document_node(parser)->v.document.doc_type_quirks_mode + != GUMBO_DOCTYPE_QUIRKS + ) { maybe_implicitly_close_p_tag(parser, token); } insert_element_from_token(parser, token); set_frameset_not_ok(parser); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), - TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN), + TAG(WBR) + }) + ) { bool success = true; if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) { success = false; @@ -2801,8 +3107,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { pop_current_node(parser); acknowledge_self_closing_tag(parser); return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)}) + ) { insert_element_from_token(parser, token); 
pop_current_node(parser); acknowledge_self_closing_tag(parser); @@ -2814,101 +3121,6 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { acknowledge_self_closing_tag(parser); set_frameset_not_ok(parser); return result; - } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) { - parser_add_parse_error(parser, token); - if (parser->_parser_state->_form_element != NULL && - !has_open_element(parser, GUMBO_TAG_TEMPLATE)) { - ignore_token(parser); - return false; - } - acknowledge_self_closing_tag(parser); - maybe_implicitly_close_p_tag(parser, token); - set_frameset_not_ok(parser); - - GumboVector* token_attrs = &token->v.start_tag.attributes; - GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt"); - GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action"); - GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name"); - - GumboNode* form = insert_element_of_tag_type( - parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX); - if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { - parser->_parser_state->_form_element = form; - } - if (action_attr) { - gumbo_vector_add(parser, action_attr, &form->v.element.attributes); - } - insert_element_of_tag_type( - parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); - pop_current_node(parser); // <hr> - - insert_element_of_tag_type( - parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX); - TextNodeBufferState* text_state = &parser->_parser_state->_text_node; - text_state->_start_original_text = token->original_text.data; - text_state->_start_position = token->position; - text_state->_type = GUMBO_NODE_TEXT; - if (prompt_attr) { - int prompt_attr_length = strlen(prompt_attr->value); - gumbo_string_buffer_destroy(parser, &text_state->_buffer); - text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value); - text_state->_buffer.length = prompt_attr_length; - text_state->_buffer.capacity = prompt_attr_length + 1; - gumbo_destroy_attribute(parser, 
prompt_attr); - } else { - GumboStringPiece prompt_text = - GUMBO_STRING("This is a searchable index. Enter search keywords: "); - gumbo_string_buffer_append_string( - parser, &prompt_text, &text_state->_buffer); - } - - GumboNode* input = insert_element_of_tag_type( - parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX); - for (unsigned int i = 0; i < token_attrs->length; ++i) { - GumboAttribute* attr = token_attrs->data[i]; - if (attr != prompt_attr && attr != action_attr && attr != name_attr) { - gumbo_vector_add(parser, attr, &input->v.element.attributes); - } - token_attrs->data[i] = NULL; - } - - // All attributes have been successfully transferred and nulled out at this - // point, so the call to ignore_token will free the memory for it without - // touching the attributes. - ignore_token(parser); - - // The name attribute, if present, should be destroyed since it's ignored - // when copying over. The action attribute should be kept since it's moved - // to the form. - if (name_attr) { - gumbo_destroy_attribute(parser, name_attr); - } - - GumboAttribute* name = - gumbo_parser_allocate(parser, sizeof(GumboAttribute)); - GumboStringPiece name_str = GUMBO_STRING("name"); - GumboStringPiece isindex_str = GUMBO_STRING("isindex"); - name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; - name->name = gumbo_copy_stringz(parser, "name"); - name->value = gumbo_copy_stringz(parser, "isindex"); - name->original_name = name_str; - name->original_value = isindex_str; - name->name_start = kGumboEmptySourcePosition; - name->name_end = kGumboEmptySourcePosition; - name->value_start = kGumboEmptySourcePosition; - name->value_end = kGumboEmptySourcePosition; - gumbo_vector_add(parser, name, &input->v.element.attributes); - - pop_current_node(parser); // <input> - pop_current_node(parser); // <label> - insert_element_of_tag_type( - parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX); - pop_current_node(parser); // <hr> - pop_current_node(parser); // <form> - if 
(!has_open_element(parser, GUMBO_TAG_TEMPLATE)) { - parser->_parser_state->_form_element = NULL; - } - return false; } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) { run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA); parser->_parser_state->_ignore_next_linefeed = true; @@ -2932,37 +3144,45 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { insert_element_from_token(parser, token); set_frameset_not_ok(parser); GumboInsertionMode state = parser->_parser_state->_insertion_mode; - if (state == GUMBO_INSERTION_MODE_IN_TABLE || - state == GUMBO_INSERTION_MODE_IN_CAPTION || - state == GUMBO_INSERTION_MODE_IN_TABLE_BODY || - state == GUMBO_INSERTION_MODE_IN_ROW || - state == GUMBO_INSERTION_MODE_IN_CELL) { + if ( + state == GUMBO_INSERTION_MODE_IN_TABLE + || state == GUMBO_INSERTION_MODE_IN_CAPTION + || state == GUMBO_INSERTION_MODE_IN_TABLE_BODY + || state == GUMBO_INSERTION_MODE_IN_ROW + || state == GUMBO_INSERTION_MODE_IN_CELL + ) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE); } else { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT); } return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(OPTION), TAG(OPTGROUP)}) + ) { if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) { pop_current_node(parser); } reconstruct_active_formatting_elements(parser); insert_element_from_token(parser, token); return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)}) + ) { bool success = true; - GumboTag exception = - tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)}) - ? GUMBO_TAG_RTC - : GUMBO_TAG_LAST; + GumboTag exception = tag_in(token, kStartTag, &(const TagSet){TAG(RT), TAG(RP)}) + ? 
GUMBO_TAG_RTC + : GUMBO_TAG_LAST + ; if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) { generate_implied_end_tags(parser, exception); } - if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) && - !(exception == GUMBO_TAG_LAST || - node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) { + if ( + !node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) + && !( + exception == GUMBO_TAG_LAST + || node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC) + ) + ) { parser_add_parse_error(parser, token); success = false; } @@ -2971,14 +3191,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) { parser_add_parse_error(parser, token); reconstruct_active_formatting_elements(parser); - insert_element_of_tag_type( - parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG); + insert_element_of_tag_type ( + parser, + GUMBO_TAG_BR, + GUMBO_INSERTION_CONVERTED_FROM_END_TAG + ); pop_current_node(parser); return false; } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) { reconstruct_active_formatting_elements(parser); - adjust_mathml_attributes(parser, token); - adjust_foreign_attributes(parser, token); + adjust_mathml_attributes(token); + adjust_foreign_attributes(token); insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML); if (token->v.start_tag.is_self_closing) { pop_current_node(parser); @@ -2987,18 +3210,20 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) { reconstruct_active_formatting_elements(parser); - adjust_svg_attributes(parser, token); - adjust_foreign_attributes(parser, token); + adjust_svg_attributes(token); + adjust_foreign_attributes(token); insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG); if (token->v.start_tag.is_self_closing) { pop_current_node(parser); acknowledge_self_closing_tag(parser); } return true; - } else if (tag_in(token, kStartTag, - 
(gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), - TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT), - TAG(TH), TAG(THEAD), TAG(TR)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) + }) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3008,22 +3233,26 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { return true; } else { assert(token->type == GUMBO_TOKEN_END_TAG); - GumboTag end_tag = token->v.end_tag; + GumboTag end_tag = token->v.end_tag.tag; assert(state->_open_elements.length > 0); assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); // Walk up the stack of open elements until we find one that either: // a) Matches the tag name we saw // b) Is in the "special" category. - // If we see a), implicitly close everything up to and including it. If we + // If we see a), implicitly close everything up to and including it. If we // see b), then record a parse error, don't close anything (except the // implied end tags) and ignore the end tag token. for (int i = state->_open_elements.length; --i >= 0;) { const GumboNode* node = state->_open_elements.data[i]; + // XXX(sfc): This doesn't work for something like <body><foo></bar> + // since foo and bar have the same tag of GUMBO_TAG_UNKNOWN if (node_html_tag_is(node, end_tag)) { generate_implied_end_tags(parser, end_tag); // TODO(jdtang): Do I need to add a parse error here? The condition in // the spec seems like it's the inverse of the loop condition above, and // so would never fire. + // XXX(sfc): Yes, an error is needed here + // I think <p><div></p> is an example. while (node != pop_current_node(parser)) ; // Pop everything. 
return true; @@ -3039,15 +3268,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata static bool handle_text(GumboParser* parser, GumboToken* token) { - if (token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_WHITESPACE) { + if ( + token->type == GUMBO_TOKEN_CHARACTER + || token->type == GUMBO_TOKEN_WHITESPACE + ) { insert_text_token(parser, token); } else { // We provide only bare-bones script handling that doesn't involve any of // the parser-pause/already-started/script-nesting flags or re-entrant - // invocations of the tokenizer. Because the intended usage of this library + // invocations of the tokenizer. Because the intended usage of this library // is mostly for templating, refactoring, and static-analysis libraries, we // provide the script body as a text-node child of the <script> element. // This behavior doesn't support document.write of partial HTML elements, @@ -3062,13 +3293,15 @@ static bool handle_text(GumboParser* parser, GumboToken* token) { return true; } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable static bool handle_in_table(GumboParser* parser, GumboToken* token) { GumboParserState* state = parser->_parser_state; - if (token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_WHITESPACE) { + if ( + token->type == GUMBO_TOKEN_CHARACTER + || token->type == GUMBO_TOKEN_WHITESPACE + ) { // The "pending table character tokens" list described in the spec is - // nothing more than the TextNodeBufferState. We accumulate text tokens as + // nothing more than the TextNodeBufferState. 
We accumulate text tokens as // normal, except that when we go to flush them in the handle_in_table_text, // we set _foster_parent_insertions if there're non-whitespace characters in // the buffer. @@ -3097,19 +3330,27 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) { clear_stack_to_table_context(parser); - insert_element_of_tag_type( - parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED); + insert_element_of_tag_type ( + parser, + GUMBO_TAG_COLGROUP, + GUMBO_INSERTION_IMPLIED + ); parser->_parser_state->_reprocess_current_token = true; set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP); return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), - TAG(TH), TAG(TR)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), TAG(TH), TAG(TR) + }) + ) { clear_stack_to_table_context(parser); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); - if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) { - insert_element_of_tag_type( - parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED); + if (tag_in(token, kStartTag, &(const TagSet){TAG(TD), TAG(TH), TAG(TR)})) { + insert_element_of_tag_type ( + parser, + GUMBO_TAG_TBODY, + GUMBO_INSERTION_IMPLIED + ); state->_reprocess_current_token = true; } else { insert_element_from_token(parser, token); @@ -3129,20 +3370,24 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { return false; } return true; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), - TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT), - TAG(TH), TAG(THEAD), TAG(TR)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), + TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) + }) + ) { 
parser_add_parse_error(parser, token); ignore_token(parser); return false; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) || - (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) + || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) + ) { return handle_in_head(parser, token); - } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) && - attribute_matches( - &token->v.start_tag.attributes, "type", "hidden")) { + } else if ( + tag_is(token, kStartTag, GUMBO_TAG_INPUT) + && attribute_matches(&token->v.start_tag.attributes, "type", "hidden") + ) { parser_add_parse_error(parser, token); insert_element_from_token(parser, token); pop_current_node(parser); @@ -3167,32 +3412,40 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext static bool handle_in_table_text(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_NULL) { parser_add_parse_error(parser, token); ignore_token(parser); return false; - } else if (token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_WHITESPACE) { + } else if ( + token->type == GUMBO_TOKEN_CHARACTER + || token->type == GUMBO_TOKEN_WHITESPACE + ) { insert_text_token(parser, token); return true; } else { GumboParserState* state = parser->_parser_state; GumboStringBuffer* buffer = &state->_text_node._buffer; - // Can't use strspn for this because GumboStringBuffers are not - // null-terminated. - // Note that TextNodeBuffer may contain UTF-8 characters, but the presence - // of any one byte that is not whitespace means we flip the flag, so this - // loop is still valid. 
- for (unsigned int i = 0; i < buffer->length; ++i) { - if (!isspace((unsigned char) buffer->data[i]) || - buffer->data[i] == '\v') { + const char* data = buffer->data; + // Note that TextNodeBuffer may contain UTF-8 characters, but the + // presence of any one byte that is not whitespace means we flip + // the flag, so this loop is still valid. + for (size_t i = 0, n = buffer->length; i < n; ++i) { + switch (data[i]) { + case '\t': + case '\n': + case '\f': + case '\r': + case ' ': + continue; + default: state->_foster_parent_insertions = true; reconstruct_active_formatting_elements(parser); - break; + goto loopbreak; } } + loopbreak: maybe_flush_text_node_buffer(parser); state->_foster_parent_insertions = false; state->_reprocess_current_token = true; @@ -3201,7 +3454,7 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption static bool handle_in_caption(GumboParser* parser, GumboToken* token) { if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) { if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { @@ -3220,11 +3473,13 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); return result; } - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), - TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), - TAG(TR)}) || - (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD), + TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) + }) + || (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) + ) { if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) { parser_add_parse_error(parser, token); ignore_token(parser); @@ -3236,10 +3491,12 @@ static 
bool handle_in_caption(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); parser->_parser_state->_reprocess_current_token = true; return true; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), - TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), - TAG(TR)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), + TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) + }) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3248,7 +3505,7 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_WHITESPACE) { insert_text_token(parser, token); @@ -3280,8 +3537,10 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { parser_add_parse_error(parser, token); ignore_token(parser); return false; - } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) || - tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + } else if ( + tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) + || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) + ) { return handle_in_head(parser, token); } else if (token->type == GUMBO_TOKEN_EOF) { return handle_in_body(parser, token); @@ -3298,23 +3557,24 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody static bool handle_in_table_body(GumboParser* parser, GumboToken* token) { if (tag_is(token, kStartTag, GUMBO_TAG_TR)) { 
clear_stack_to_table_body_context(parser); insert_element_from_token(parser, token); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); return true; - } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { + } else if (tag_in(token, kStartTag, &td_th_tags)) { parser_add_parse_error(parser, token); clear_stack_to_table_body_context(parser); insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED); parser->_parser_state->_reprocess_current_token = true; set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); return false; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { - if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { + } else if ( + tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) + ) { + if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3323,13 +3583,20 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) { pop_current_node(parser); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); return true; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), - TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) || - tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { - if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) || - has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) || - has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), + TAG(THEAD) + }) + || tag_is(token, kEndTag, GUMBO_TAG_TABLE) + ) { + if ( + !( + has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) + || has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) + || has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT) + ) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return 
false; @@ -3339,9 +3606,12 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); parser->_parser_state->_reprocess_current_token = true; return true; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), - TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), TAG(COLGROUP), + TAG(HTML), TAG(TD), TAG(TH) + }) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3350,9 +3620,9 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr static bool handle_in_row(GumboParser* parser, GumboToken* token) { - if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) { + if (tag_in(token, kStartTag, &td_th_tags)) { clear_stack_to_table_row_context(parser); insert_element_from_token(parser, token); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL); @@ -3369,10 +3639,13 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); return true; } - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), - TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) || - tag_is(token, kEndTag, GUMBO_TAG_TABLE)) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), + TAG(THEAD), TAG(TR) + }) + || tag_is(token, kEndTag, GUMBO_TAG_TABLE) + ) { if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) { parser_add_parse_error(parser, token); ignore_token(parser); @@ -3384,10 +3657,13 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) { 
parser->_parser_state->_reprocess_current_token = true; return true; } - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { - if (!has_an_element_in_table_scope(parser, token->v.end_tag) || - (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) + ) { + if ( + !has_an_element_in_table_scope(parser, token->v.end_tag.tag) + || !has_an_element_in_table_scope(parser, GUMBO_TAG_TR) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3398,9 +3674,12 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) { parser->_parser_state->_reprocess_current_token = true; return true; } - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), - TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), + TAG(TD), TAG(TH) + }) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3409,23 +3688,27 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd static bool handle_in_cell(GumboParser* parser, GumboToken* token) { - if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { - GumboTag token_tag = token->v.end_tag; + if (tag_in(token, kEndTag, &td_th_tags)) { + GumboTag token_tag = token->v.end_tag.tag; if (!has_an_element_in_table_scope(parser, token_tag)) { parser_add_parse_error(parser, token); ignore_token(parser); return false; } return close_table_cell(parser, token, token_tag); - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP), - TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), - 
TAG(TR)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD), + TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) + }) + ) { gumbo_debug("Handling <td> in cell.\n"); - if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) && - !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) { + if ( + !has_an_element_in_table_scope(parser, GUMBO_TAG_TH) + && !has_an_element_in_table_scope(parser, GUMBO_TAG_TD) + ) { gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n"); parser_add_parse_error(parser, token); ignore_token(parser); @@ -3433,14 +3716,20 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) { } parser->_parser_state->_reprocess_current_token = true; return close_current_cell(parser, token); - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION), - TAG(COL), TAG(COLGROUP), TAG(HTML)})) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML) + }) + ) { parser_add_parse_error(parser, token); ignore_token(parser); return false; - } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY), - TAG(TFOOT), TAG(THEAD), TAG(TR)})) { - if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { + } else if ( + tag_in(token, kEndTag, &(const TagSet) { + TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) + }) + ) { + if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { parser_add_parse_error(parser, token); ignore_token(parser); return false; @@ -3452,14 +3741,16 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect static bool handle_in_select(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_NULL) { parser_add_parse_error(parser, 
token); ignore_token(parser); return false; - } else if (token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_WHITESPACE) { + } else if ( + token->type == GUMBO_TOKEN_CHARACTER + || token->type == GUMBO_TOKEN_WHITESPACE + ) { insert_text_token(parser, token); return true; } else if (token->type == GUMBO_TOKEN_DOCTYPE) { @@ -3488,9 +3779,13 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) { return true; } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) { GumboVector* open_elements = &parser->_parser_state->_open_elements; - if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) && - node_html_tag_is(open_elements->data[open_elements->length - 2], - GUMBO_TAG_OPTGROUP)) { + if ( + node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) + && node_html_tag_is ( + open_elements->data[open_elements->length - 2], + GUMBO_TAG_OPTGROUP + ) + ) { pop_current_node(parser); } if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) { @@ -3525,8 +3820,9 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) { close_current_select(parser); } return false; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)}) + ) { parser_add_parse_error(parser, token); if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) { ignore_token(parser); @@ -3535,9 +3831,10 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) { parser->_parser_state->_reprocess_current_token = true; } return false; - } else if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) || - tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + } else if ( + tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)}) + || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) + ) { return handle_in_head(parser, token); } else if (token->type == GUMBO_TOKEN_EOF) { 
return handle_in_body(parser, token); @@ -3548,20 +3845,20 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) { - if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), - TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) { + static const TagSet tags = { + TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), + TAG(TR), TAG(TD), TAG(TH) + }; + if (tag_in(token, kStartTag, &tags)) { parser_add_parse_error(parser, token); close_current_select(parser); parser->_parser_state->_reprocess_current_token = true; return false; - } else if (tag_in(token, kEndTag, - (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), - TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) { + } else if (tag_in(token, kEndTag, &tags)) { parser_add_parse_error(parser, token); - if (!has_an_element_in_table_scope(parser, token->v.end_tag)) { + if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) { ignore_token(parser); return false; } else { @@ -3577,23 +3874,32 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate static bool handle_in_template(GumboParser* parser, GumboToken* token) { GumboParserState* state = parser->_parser_state; - if (token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_CHARACTER || - token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL || - token->type == GUMBO_TOKEN_DOCTYPE) { - return handle_in_body(parser, token); - } else if (tag_in(token, kStartTag, - 
(gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), - TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), - TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) || - tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) { + switch (token->type) { + case GUMBO_TOKEN_WHITESPACE: + case GUMBO_TOKEN_CHARACTER: + case GUMBO_TOKEN_COMMENT: + case GUMBO_TOKEN_NULL: + case GUMBO_TOKEN_DOCTYPE: + return handle_in_body(parser, token); + default: + break; + } + if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META), + TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) + }) + || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE) + ) { return handle_in_head(parser, token); - } else if (tag_in( - token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), - TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) { + } else if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD) + }) + ) { pop_template_insertion_mode(parser); push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE); @@ -3611,7 +3917,7 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY); state->_reprocess_current_token = true; return true; - } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) { + } else if (tag_in(token, kStartTag, &td_th_tags)) { pop_template_insertion_mode(parser); push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW); @@ -3646,10 +3952,12 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody static bool handle_after_body(GumboParser* parser, GumboToken* token) { - 
if (token->type == GUMBO_TOKEN_WHITESPACE || - tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + if ( + token->type == GUMBO_TOKEN_WHITESPACE + || tag_is(token, kStartTag, GUMBO_TAG_HTML) + ) { return handle_in_body(parser, token); } else if (token->type == GUMBO_TOKEN_COMMENT) { GumboNode* html_node = parser->_output->root; @@ -3670,8 +3978,10 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY); GumboNode* html = parser->_parser_state->_open_elements.data[0]; assert(node_html_tag_is(html, GUMBO_TAG_HTML)); - record_end_of_element( - parser->_parser_state->_current_token, &html->v.element); + record_end_of_element ( + parser->_parser_state->_current_token, + &html->v.element + ); return true; } else if (token->type == GUMBO_TOKEN_EOF) { return true; @@ -3683,7 +3993,7 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset static bool handle_in_frameset(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_WHITESPACE) { insert_text_token(parser, token); @@ -3707,8 +4017,10 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) { return false; } pop_current_node(parser); - if (!is_fragment_parser(parser) && - !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) { + if ( + !is_fragment_parser(parser) + && !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET) + ) { set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET); } return true; @@ -3732,7 +4044,7 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset +// 
https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset static bool handle_after_frameset(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_WHITESPACE) { insert_text_token(parser, token); @@ -3749,8 +4061,10 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) { } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) { GumboNode* html = parser->_parser_state->_open_elements.data[0]; assert(node_html_tag_is(html, GUMBO_TAG_HTML)); - record_end_of_element( - parser->_parser_state->_current_token, &html->v.element); + record_end_of_element ( + parser->_parser_state->_current_token, + &html->v.element + ); set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET); return true; } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) { @@ -3764,14 +4078,16 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode +// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode static bool handle_after_after_body(GumboParser* parser, GumboToken* token) { if (token->type == GUMBO_TOKEN_COMMENT) { append_comment_node(parser, get_document_node(parser), token); return true; - } else if (token->type == GUMBO_TOKEN_DOCTYPE || - token->type == GUMBO_TOKEN_WHITESPACE || - tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + } else if ( + token->type == GUMBO_TOKEN_DOCTYPE + || token->type == GUMBO_TOKEN_WHITESPACE + || tag_is(token, kStartTag, GUMBO_TAG_HTML) + ) { return handle_in_body(parser, token); } else if (token->type == GUMBO_TOKEN_EOF) { return true; @@ -3783,15 +4099,19 @@ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode -static bool handle_after_after_frameset( - GumboParser* 
parser, GumboToken* token) { +// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode +static bool handle_after_after_frameset ( + GumboParser* parser, + GumboToken* token +) { if (token->type == GUMBO_TOKEN_COMMENT) { append_comment_node(parser, get_document_node(parser), token); return true; - } else if (token->type == GUMBO_TOKEN_DOCTYPE || - token->type == GUMBO_TOKEN_WHITESPACE || - tag_is(token, kStartTag, GUMBO_TAG_HTML)) { + } else if ( + token->type == GUMBO_TOKEN_DOCTYPE + || token->type == GUMBO_TOKEN_WHITESPACE + || tag_is(token, kStartTag, GUMBO_TAG_HTML) + ) { return handle_in_body(parser, token); } else if (token->type == GUMBO_TOKEN_EOF) { return true; @@ -3804,24 +4124,42 @@ static bool handle_after_after_frameset( } } -// Function pointers for each insertion mode. Keep in sync with -// insertion_mode.h. +// Function pointers for each insertion mode. +// Keep in sync with insertion_mode.h. typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token); -static const TokenHandler kTokenHandlers[] = {handle_initial, - handle_before_html, handle_before_head, handle_in_head, - handle_in_head_noscript, handle_after_head, handle_in_body, handle_text, - handle_in_table, handle_in_table_text, handle_in_caption, - handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell, - handle_in_select, handle_in_select_in_table, handle_in_template, - handle_after_body, handle_in_frameset, handle_after_frameset, - handle_after_after_body, handle_after_after_frameset}; +static const TokenHandler kTokenHandlers[] = { + handle_initial, + handle_before_html, + handle_before_head, + handle_in_head, + handle_in_head_noscript, + handle_after_head, + handle_in_body, + handle_text, + handle_in_table, + handle_in_table_text, + handle_in_caption, + handle_in_column_group, + handle_in_table_body, + handle_in_row, + handle_in_cell, + handle_in_select, + handle_in_select_in_table, + handle_in_template, + 
handle_after_body, + handle_in_frameset, + handle_after_frameset, + handle_after_after_body, + handle_after_after_frameset +}; static bool handle_html_content(GumboParser* parser, GumboToken* token) { - return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode]( - parser, token); + const GumboInsertionMode mode = parser->_parser_state->_insertion_mode; + const TokenHandler handler = kTokenHandlers[mode]; + return handler(parser, token); } -// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign +// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { gumbo_debug("Handling foreign content"); switch (token->type) { @@ -3850,19 +4188,25 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { break; } // Order matters for these clauses. - if (tag_in(token, kStartTag, - (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), - TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), - TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), - TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI), - TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P), - TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG), - TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U), - TAG(UL), TAG(VAR)}) || - (tag_is(token, kStartTag, GUMBO_TAG_FONT) && - (token_has_attribute(token, "color") || - token_has_attribute(token, "face") || - token_has_attribute(token, "size")))) { + if ( + tag_in(token, kStartTag, &(const TagSet) { + TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), TAG(CENTER), + TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), TAG(EM), TAG(EMBED), + TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), TAG(HEAD), + TAG(HR), TAG(I), TAG(IMG), TAG(LI), TAG(LISTING), TAG(MENU), TAG(META), + TAG(NOBR), TAG(OL), TAG(P), TAG(PRE), 
TAG(RUBY), TAG(S), TAG(SMALL), + TAG(SPAN), TAG(STRONG), TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), + TAG(TT), TAG(U), TAG(UL), TAG(VAR) + }) + || ( + tag_is(token, kStartTag, GUMBO_TAG_FONT) + && ( + token_has_attribute(token, "color") + || token_has_attribute(token, "face") + || token_has_attribute(token, "size") + ) + ) + ) { /* Parse error */ parser_add_parse_error(parser, token); @@ -3874,10 +4218,13 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { if (!is_fragment_parser(parser)) { do { pop_current_node(parser); - } while (!(is_mathml_integration_point(get_current_node(parser)) || - is_html_integration_point(get_current_node(parser)) || - get_current_node(parser)->v.element.tag_namespace == - GUMBO_NAMESPACE_HTML)); + } while ( + !( + is_mathml_integration_point(get_current_node(parser)) + || is_html_integration_point(get_current_node(parser)) + || get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML + ) + ); parser->_parser_state->_reprocess_current_token = true; return false; } @@ -3889,14 +4236,13 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { const GumboNamespaceEnum current_namespace = get_adjusted_current_node(parser)->v.element.tag_namespace; if (current_namespace == GUMBO_NAMESPACE_MATHML) { - adjust_mathml_attributes(parser, token); + adjust_mathml_attributes(token); } if (current_namespace == GUMBO_NAMESPACE_SVG) { - // Tag adjustment is left to the gumbo_normalize_svg_tagname helper - // function. 
- adjust_svg_attributes(parser, token); + adjust_svg_tag(token); + adjust_svg_attributes(token); } - adjust_foreign_attributes(parser, token); + adjust_foreign_attributes(token); insert_foreign_element(parser, token, current_namespace); if (token->v.start_tag.is_self_closing) { pop_current_node(parser); @@ -3909,6 +4255,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_END_TAG); GumboNode* node = get_current_node(parser); assert(node != NULL); + // XXX(sfc): This doesn't properly handle replacements. GumboStringPiece token_tagname = token->original_text; GumboStringPiece node_tagname = node->v.element.original_tag; gumbo_tag_from_original_text(&token_tagname); @@ -3925,12 +4272,16 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { // case we do nothing) or we find the element that we're about to // close (in which case we pop everything we've seen until that // point.) - gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length, - node_tagname.data, i); + gumbo_debug ( + "Foreign %.*s node at %d.\n", + (int) node_tagname.length, + node_tagname.data, + i + ); if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) { gumbo_debug("Matches.\n"); while (pop_current_node(parser) != node) { - // Pop all the nodes below the current one. Node is guaranteed to + // Pop all the nodes below the current one. Node is guaranteed to // be an element on the stack of open elements (set below), so // this loop is guaranteed to terminate. 
} @@ -3954,10 +4305,12 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { } } -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction +// https://html.spec.whatwg.org/multipage/parsing.html#tree-construction static bool handle_token(GumboParser* parser, GumboToken* token) { - if (parser->_parser_state->_ignore_next_linefeed && - token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') { + if ( + parser->_parser_state->_ignore_next_linefeed + && token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n' + ) { parser->_parser_state->_ignore_next_linefeed = false; ignore_token(parser); return true; @@ -3976,11 +4329,16 @@ static bool handle_token(GumboParser* parser, GumboToken* token) { } const GumboNode* current_node = get_adjusted_current_node(parser); - assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT || - current_node->type == GUMBO_NODE_TEMPLATE); + assert ( + !current_node + || current_node->type == GUMBO_NODE_ELEMENT + || current_node->type == GUMBO_NODE_TEMPLATE + ); if (current_node) { - gumbo_debug("Current node: <%s>.\n", - gumbo_normalized_tagname(current_node->v.element.tag)); + gumbo_debug ( + "Current node: <%s>.\n", + gumbo_normalized_tagname(current_node->v.element.tag) + ); } if (!current_node || current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML || @@ -3990,7 +4348,7 @@ static bool handle_token(GumboParser* parser, GumboToken* token) { token->type == GUMBO_TOKEN_NULL || (token->type == GUMBO_TOKEN_START_TAG && !tag_in(token, kStartTag, - (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) || + &(const TagSet){TAG(MGLYPH), TAG(MALIGNMARK)})))) || (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML && node_qualified_tag_is( current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && @@ -4007,8 +4365,11 @@ static bool handle_token(GumboParser* parser, GumboToken* token) { } } -static void 
fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx, - GumboNamespaceEnum fragment_namespace) { +static void fragment_parser_init ( + GumboParser* parser, + GumboTag fragment_ctx, + GumboNamespaceEnum fragment_namespace +) { GumboNode* root; assert(fragment_ctx != GUMBO_TAG_LAST); @@ -4054,8 +4415,11 @@ static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx, } // 5. 6. 7. - root = insert_element_of_tag_type( - parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED); + root = insert_element_of_tag_type ( + parser, + GUMBO_TAG_HTML, + GUMBO_INSERTION_IMPLIED + ); parser->_output->root = root; // 8. @@ -4068,12 +4432,18 @@ static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx, } GumboOutput* gumbo_parse(const char* buffer) { - return gumbo_parse_with_options( - &kGumboDefaultOptions, buffer, strlen(buffer)); + return gumbo_parse_with_options ( + &kGumboDefaultOptions, + buffer, + strlen(buffer) + ); } -GumboOutput* gumbo_parse_with_options( - const GumboOptions* options, const char* buffer, size_t length) { +GumboOutput* gumbo_parse_with_options ( + const GumboOptions* options, + const char* buffer, + size_t length +) { GumboParser parser; parser._options = options; output_init(&parser); @@ -4081,16 +4451,23 @@ GumboOutput* gumbo_parse_with_options( parser_state_init(&parser); if (options->fragment_context != GUMBO_TAG_LAST) { - fragment_parser_init( - &parser, options->fragment_context, options->fragment_namespace); + fragment_parser_init ( + &parser, + options->fragment_context, + options->fragment_namespace + ); } GumboParserState* state = parser._parser_state; - gumbo_debug("Parsing %.*s.\n", length, buffer); + gumbo_debug ( + "Parsing %.*s.\n", + (int) length, + buffer + ); // Sanity check so that infinite loops die with an assertion failure instead // of hanging the process before we ever get an error. 
- int loop_count = 0; + uint_fast32_t loop_count = 0; GumboToken token; bool has_error = false; @@ -4100,21 +4477,27 @@ GumboOutput* gumbo_parse_with_options( state->_reprocess_current_token = false; } else { GumboNode* current_node = get_current_node(&parser); - gumbo_tokenizer_set_is_current_node_foreign(&parser, - current_node && - current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML); + gumbo_tokenizer_set_is_current_node_foreign ( + &parser, + current_node && + current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML + ); has_error = !gumbo_lex(&parser, &token) || has_error; } + const char* token_type = "text"; switch (token.type) { case GUMBO_TOKEN_DOCTYPE: token_type = "doctype"; break; case GUMBO_TOKEN_START_TAG: - token_type = gumbo_normalized_tagname(token.v.start_tag.tag); + if (token.v.start_tag.tag == GUMBO_TAG_UNKNOWN) + token_type = token.v.start_tag.name; + else + token_type = gumbo_normalized_tagname(token.v.start_tag.tag); break; case GUMBO_TOKEN_END_TAG: - token_type = gumbo_normalized_tagname(token.v.end_tag); + token_type = gumbo_normalized_tagname(token.v.end_tag.tag); break; case GUMBO_TOKEN_COMMENT: token_type = "comment"; @@ -4122,47 +4505,70 @@ GumboOutput* gumbo_parse_with_options( default: break; } - gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type, - token.position.line, token.position.column, state->_insertion_mode); + gumbo_debug ( + "Handling %s token @%zu:%zu in state %u.\n", + (char*) token_type, + token.position.line, + token.position.column, + state->_insertion_mode + ); state->_current_token = &token; - state->_self_closing_flag_acknowledged = - !(token.type == GUMBO_TOKEN_START_TAG && - token.v.start_tag.is_self_closing); + state->_self_closing_flag_acknowledged = false; has_error = !handle_token(&parser, &token) || has_error; // Check for memory leaks when ownership is transferred from start tag // tokens to nodes. 
- assert(state->_reprocess_current_token || - token.type != GUMBO_TOKEN_START_TAG || - token.v.start_tag.attributes.data == NULL); - - if (!state->_self_closing_flag_acknowledged) { - GumboError* error = parser_add_parse_error(&parser, &token); - if (error) { - error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG; + assert ( + state->_reprocess_current_token + || token.type != GUMBO_TOKEN_START_TAG + || (token.v.start_tag.attributes.data == NULL + && token.v.start_tag.name == NULL) + ); + + if (!state->_reprocess_current_token) { + if (token.type == GUMBO_TOKEN_START_TAG && + token.v.start_tag.is_self_closing && + !state->_self_closing_flag_acknowledged) { + GumboError* error = parser_add_parse_error(&parser, &token); + if (error) + error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG; } + if (token.type == GUMBO_TOKEN_END_TAG && + token.v.end_tag.is_self_closing) { + GumboError* error = parser_add_parse_error(&parser, &token); + if (error) + error->type = GUMBO_ERR_SELF_CLOSING_END_TAG; + } + } + + if (unlikely(parser._parser_state->_open_elements.length > 400)) { + parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP; + gumbo_debug("Tree depth limit exceeded.\n"); + break; } ++loop_count; - assert(loop_count < 1000000000); + assert(loop_count < 1000000000UL); - } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) && - !(options->stop_on_first_error && has_error)); + } while ( + (token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) + && !(options->stop_on_first_error && has_error) + ); finish_parsing(&parser); // For API uniformity reasons, if the doctype still has nulls, convert them to // empty strings. 
GumboDocument* doc_type = &parser._output->document->v.document; if (doc_type->name == NULL) { - doc_type->name = gumbo_copy_stringz(&parser, ""); + doc_type->name = gumbo_strdup(""); } if (doc_type->public_identifier == NULL) { - doc_type->public_identifier = gumbo_copy_stringz(&parser, ""); + doc_type->public_identifier = gumbo_strdup(""); } if (doc_type->system_identifier == NULL) { - doc_type->system_identifier = gumbo_copy_stringz(&parser, ""); + doc_type->system_identifier = gumbo_strdup(""); } parser_state_destroy(&parser); @@ -4170,23 +4576,28 @@ GumboOutput* gumbo_parse_with_options( return parser._output; } -void gumbo_destroy_node(GumboOptions* options, GumboNode* node) { - // Need a dummy GumboParser because the allocator comes along with the - // options object. - GumboParser parser; - parser._options = options; - destroy_node(&parser, node); +const char* gumbo_status_to_string(GumboOutputStatus status) { + switch (status) { + case GUMBO_STATUS_OK: + return "OK"; + case GUMBO_STATUS_OUT_OF_MEMORY: + return "System allocator returned NULL during parsing"; + case GUMBO_STATUS_TREE_TOO_DEEP: + return "Document tree depth limit exceeded"; + default: + return "Unknown GumboOutputStatus value"; + } } -void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) { - // Need a dummy GumboParser because the allocator comes along with the - // options object. 
- GumboParser parser; - parser._options = options; - destroy_node(&parser, output->document); +void gumbo_destroy_node(GumboNode* node) { + destroy_node(node); +} + +void gumbo_destroy_output(GumboOutput* output) { + destroy_node(output->document); for (unsigned int i = 0; i < output->errors.length; ++i) { - gumbo_error_destroy(&parser, output->errors.data[i]); + gumbo_error_destroy(output->errors.data[i]); } - gumbo_vector_destroy(&parser, &output->errors); - gumbo_parser_deallocate(&parser, output); + gumbo_vector_destroy(&output->errors); + gumbo_free(output); } diff --git a/gumbo-parser/src/parser.h b/gumbo-parser/src/parser.h index 95019e3e..740559f7 100644 --- a/gumbo-parser/src/parser.h +++ b/gumbo-parser/src/parser.h @@ -1,22 +1,3 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// Contains the definition of the top-level GumboParser structure that's -// threaded through basically every internal function in the library. - #ifndef GUMBO_PARSER_H_ #define GUMBO_PARSER_H_ @@ -24,13 +5,16 @@ extern "C" { #endif +// Contains the definition of the top-level GumboParser structure that's +// threaded through basically every internal function in the library. 
+ struct GumboInternalParserState; struct GumboInternalOutput; struct GumboInternalOptions; struct GumboInternalTokenizerState; // An overarching struct that's threaded through (nearly) all functions in the -// library, OOP-style. This gives each function access to the options and +// library, OOP-style. This gives each function access to the options and // output, along with any internal state needed for the parse. typedef struct GumboInternalParser { // Settings for this parse run. @@ -40,12 +24,12 @@ typedef struct GumboInternalParser { struct GumboInternalOutput* _output; // The internal tokenizer state, defined as a pointer to avoid a cyclic - // dependency on html5tokenizer.h. The main parse routine is responsible for + // dependency on html5tokenizer.h. The main parse routine is responsible for // initializing this on parse start, and destroying it on parse end. // End-users will never see a non-garbage value in this pointer. struct GumboInternalTokenizerState* _tokenizer_state; - // The internal parser state. Initialized on parse start and destroyed on + // The internal parser state. Initialized on parse start and destroyed on // parse end; end-users will never see a non-garbage value in this pointer. 
struct GumboInternalParserState* _parser_state; } GumboParser; diff --git a/gumbo-parser/src/replacement.h b/gumbo-parser/src/replacement.h new file mode 100644 index 00000000..327264d4 --- /dev/null +++ b/gumbo-parser/src/replacement.h @@ -0,0 +1,33 @@ +#ifndef GUMBO_REPLACEMENT_H_ +#define GUMBO_REPLACEMENT_H_ + +#include <stddef.h> +#include "gumbo.h" + +typedef struct { + const char *const from; + const char *const to; +} StringReplacement; + +const StringReplacement *gumbo_get_svg_tag_replacement ( + const char* str, + size_t len +); + +const StringReplacement *gumbo_get_svg_attr_replacement ( + const char* str, + size_t len +); + +typedef struct { + const char *const from; + const char *const local_name; + const GumboAttributeNamespaceEnum attr_namespace; +} ForeignAttrReplacement; + +const ForeignAttrReplacement *gumbo_get_foreign_attr_replacement ( + const char* str, + size_t len +); + +#endif // GUMBO_REPLACEMENT_H_ diff --git a/gumbo-parser/src/string_buffer.c b/gumbo-parser/src/string_buffer.c index d9be2f6b..729ff815 100644 --- a/gumbo-parser/src/string_buffer.c +++ b/gumbo-parser/src/string_buffer.c @@ -1,67 +1,61 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2010 Google Inc. 
-#include "string_buffer.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at -#include <assert.h> -#include <stdlib.h> -#include <string.h> -#include <strings.h> + https://www.apache.org/licenses/LICENSE-2.0 -#include "string_piece.h" -#include "util.h" + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ -struct GumboInternalParser; +#include <string.h> +#include "string_buffer.h" +#include "util.h" // Size chosen via statistical analysis of ~60K websites. // 99% of text nodes and 98% of attribute names/values fit in this initial size. static const size_t kDefaultStringBufferSize = 5; -static void maybe_resize_string_buffer(struct GumboInternalParser* parser, - size_t additional_chars, GumboStringBuffer* buffer) { +static void maybe_resize_string_buffer ( + size_t additional_chars, + GumboStringBuffer* buffer +) { size_t new_length = buffer->length + additional_chars; size_t new_capacity = buffer->capacity; while (new_capacity < new_length) { new_capacity *= 2; } if (new_capacity != buffer->capacity) { - char* new_data = gumbo_parser_allocate(parser, new_capacity); - memcpy(new_data, buffer->data, buffer->length); - gumbo_parser_deallocate(parser, buffer->data); - buffer->data = new_data; + buffer->data = gumbo_realloc(buffer->data, new_capacity); buffer->capacity = new_capacity; } } -void gumbo_string_buffer_init( - struct GumboInternalParser* parser, GumboStringBuffer* output) { - output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize); +void gumbo_string_buffer_init(GumboStringBuffer* output) { + output->data = gumbo_alloc(kDefaultStringBufferSize); 
output->length = 0; output->capacity = kDefaultStringBufferSize; } -void gumbo_string_buffer_reserve(struct GumboInternalParser* parser, - size_t min_capacity, GumboStringBuffer* output) { - maybe_resize_string_buffer(parser, min_capacity - output->length, output); +void gumbo_string_buffer_reserve ( + size_t min_capacity, + GumboStringBuffer* output +) { + maybe_resize_string_buffer(min_capacity - output->length, output); } -void gumbo_string_buffer_append_codepoint( - struct GumboInternalParser* parser, int c, GumboStringBuffer* output) { +void gumbo_string_buffer_append_codepoint ( + int c, + GumboStringBuffer* output +) { // num_bytes is actually the number of continuation bytes, 1 less than the - // total number of bytes. This is done to keep the loop below simple and + // total number of bytes. This is done to keep the loop below simple and // should probably change if we unroll it. int num_bytes, prefix; if (c <= 0x7f) { @@ -77,34 +71,33 @@ void gumbo_string_buffer_append_codepoint( num_bytes = 3; prefix = 0xf0; } - maybe_resize_string_buffer(parser, num_bytes + 1, output); + maybe_resize_string_buffer(num_bytes + 1, output); output->data[output->length++] = prefix | (c >> (num_bytes * 6)); for (int i = num_bytes - 1; i >= 0; --i) { output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6))); } } -void gumbo_string_buffer_append_string(struct GumboInternalParser* parser, - GumboStringPiece* str, GumboStringBuffer* output) { - maybe_resize_string_buffer(parser, str->length, output); +void gumbo_string_buffer_append_string ( + GumboStringPiece* str, + GumboStringBuffer* output +) { + maybe_resize_string_buffer(str->length, output); memcpy(output->data + output->length, str->data, str->length); output->length += str->length; } -char* gumbo_string_buffer_to_string( - struct GumboInternalParser* parser, GumboStringBuffer* input) { - char* buffer = gumbo_parser_allocate(parser, input->length + 1); +char* gumbo_string_buffer_to_string(const GumboStringBuffer* 
input) { + char* buffer = gumbo_alloc(input->length + 1); memcpy(buffer, input->data, input->length); buffer[input->length] = '\0'; return buffer; } -void gumbo_string_buffer_clear( - struct GumboInternalParser* parser, GumboStringBuffer* input) { +void gumbo_string_buffer_clear(GumboStringBuffer* input) { input->length = 0; } -void gumbo_string_buffer_destroy( - struct GumboInternalParser* parser, GumboStringBuffer* buffer) { - gumbo_parser_deallocate(parser, buffer->data); +void gumbo_string_buffer_destroy(GumboStringBuffer* buffer) { + gumbo_free(buffer->data); } diff --git a/gumbo-parser/src/string_buffer.h b/gumbo-parser/src/string_buffer.h index ee7956ac..41cabd1b 100644 --- a/gumbo-parser/src/string_buffer.h +++ b/gumbo-parser/src/string_buffer.h @@ -1,19 +1,3 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// #ifndef GUMBO_STRING_BUFFER_H_ #define GUMBO_STRING_BUFFER_H_ @@ -26,18 +10,16 @@ extern "C" { #endif -struct GumboInternalParser; - -// A struct representing a mutable, growable string. This consists of a -// heap-allocated buffer that may grow (by doubling) as necessary. When +// A struct representing a mutable, growable string. This consists of a +// heap-allocated buffer that may grow (by doubling) as necessary. When // converting to a string, this allocates a new buffer that is only as long as -// it needs to be. 
Note that the internal buffer here is *not* nul-terminated, +// it needs to be. Note that the internal buffer here is *not* nul-terminated, // so be sure not to use ordinary string manipulation functions on it. typedef struct { - // A pointer to the beginning of the string. NULL iff length == 0. + // A pointer to the beginning of the string. NULL if length == 0. char* data; - // The length of the string fragment, in bytes. May be zero. + // The length of the string fragment, in bytes. May be zero. size_t length; // The capacity of the buffer, in bytes. @@ -45,40 +27,42 @@ typedef struct { } GumboStringBuffer; // Initializes a new GumboStringBuffer. -void gumbo_string_buffer_init( - struct GumboInternalParser* parser, GumboStringBuffer* output); +void gumbo_string_buffer_init(GumboStringBuffer* output); -// Ensures that the buffer contains at least a certain amount of space. Most +// Ensures that the buffer contains at least a certain amount of space. Most // useful with snprintf and the other length-delimited string functions, which // may want to write directly into the buffer. -void gumbo_string_buffer_reserve(struct GumboInternalParser* parser, - size_t min_capacity, GumboStringBuffer* output); +void gumbo_string_buffer_reserve ( + size_t min_capacity, + GumboStringBuffer* output +); // Appends a single Unicode codepoint onto the end of the GumboStringBuffer. // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the // value of the codepoint. -void gumbo_string_buffer_append_codepoint( - struct GumboInternalParser* parser, int c, GumboStringBuffer* output); +void gumbo_string_buffer_append_codepoint ( + int c, + GumboStringBuffer* output +); // Appends a string onto the end of the GumboStringBuffer. 
-void gumbo_string_buffer_append_string(struct GumboInternalParser* parser, - GumboStringPiece* str, GumboStringBuffer* output); +void gumbo_string_buffer_append_string ( + GumboStringPiece* str, + GumboStringBuffer* output +); // Converts this string buffer to const char*, alloctaing a new buffer for it. -char* gumbo_string_buffer_to_string( - struct GumboInternalParser* parser, GumboStringBuffer* input); +char* gumbo_string_buffer_to_string(const GumboStringBuffer* input); -// Reinitialize this string buffer. This clears it by setting length=0. It +// Reinitialize this string buffer. This clears it by setting length=0. It // does not zero out the buffer itself. -void gumbo_string_buffer_clear( - struct GumboInternalParser* parser, GumboStringBuffer* input); +void gumbo_string_buffer_clear(GumboStringBuffer* input); // Deallocates this GumboStringBuffer. -void gumbo_string_buffer_destroy( - struct GumboInternalParser* parser, GumboStringBuffer* buffer); +void gumbo_string_buffer_destroy(GumboStringBuffer* buffer); #ifdef __cplusplus } #endif -#endif // GUMBO_STRING_BUFFER_H_ +#endif // GUMBO_STRING_BUFFER_H_ diff --git a/gumbo-parser/src/string_piece.c b/gumbo-parser/src/string_piece.c index 8ad5b846..129c8e53 100644 --- a/gumbo-parser/src/string_piece.c +++ b/gumbo-parser/src/string_piece.c @@ -1,48 +1,44 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2018 Craig Barnes. + Copyright 2010 Google Inc. -#include "string_piece.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at -#include <assert.h> -#include <stdlib.h> -#include <string.h> -#include <strings.h> - -#include "util.h" - -struct GumboInternalParser; + https://www.apache.org/licenses/LICENSE-2.0 -const GumboStringPiece kGumboEmptyString = {NULL, 0}; + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ -bool gumbo_string_equals( - const GumboStringPiece* str1, const GumboStringPiece* str2) { - return str1->length == str2->length && - !memcmp(str1->data, str2->data, str1->length); -} - -bool gumbo_string_equals_ignore_case( - const GumboStringPiece* str1, const GumboStringPiece* str2) { - return str1->length == str2->length && - !strncasecmp(str1->data, str2->data, str1->length); +#include <stddef.h> +#include <string.h> +#include "gumbo.h" +#include "ascii.h" + +const GumboStringPiece kGumboEmptyString = { \ + .data = NULL, \ + .length = 0 \ +}; + +bool gumbo_string_equals ( + const GumboStringPiece* str1, + const GumboStringPiece* str2 +) { + return + str1->length == str2->length + && !memcmp(str1->data, str2->data, str1->length); } -void gumbo_string_copy(struct GumboInternalParser* parser, - GumboStringPiece* dest, const GumboStringPiece* source) { - dest->length = source->length; - char* buffer = gumbo_parser_allocate(parser, source->length); - memcpy(buffer, source->data, source->length); - dest->data = buffer; +bool gumbo_string_equals_ignore_case ( + const GumboStringPiece* str1, + const 
GumboStringPiece* str2 +) { + return + str1->length == str2->length + && !gumbo_ascii_strncasecmp(str1->data, str2->data, str1->length); } diff --git a/gumbo-parser/src/string_piece.h b/gumbo-parser/src/string_piece.h deleted file mode 100644 index 8c8188c5..00000000 --- a/gumbo-parser/src/string_piece.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) - -#ifndef GUMBO_STRING_PIECE_H_ -#define GUMBO_STRING_PIECE_H_ - -#include "gumbo.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct GumboInternalParser; - -// Performs a deep-copy of an GumboStringPiece, allocating a fresh buffer in the -// destination and copying over the characters from source. Dest should be -// empty, with no buffer allocated; otherwise, this leaks it. 
-void gumbo_string_copy(struct GumboInternalParser* parser, - GumboStringPiece* dest, const GumboStringPiece* source); - -#ifdef __cplusplus -} -#endif - -#endif // GUMBO_STRING_PIECE_H_ diff --git a/gumbo-parser/src/svg_attrs.c b/gumbo-parser/src/svg_attrs.c new file mode 100644 index 00000000..76a44779 --- /dev/null +++ b/gumbo-parser/src/svg_attrs.c @@ -0,0 +1,174 @@ +/* ANSI-C code produced by gperf version 3.1 */ +/* Command-line: gperf -m100 lib/svg_attrs.gperf */ +/* Computed positions: -k'1,10,$' */ +/* Filtered by: mk/gperf-filter.sed */ + +#include "replacement.h" +#include "macros.h" +#include "ascii.h" +#include <string.h> + +#define TOTAL_KEYWORDS 58 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 19 +#define MIN_HASH_VALUE 5 +#define MAX_HASH_VALUE 77 +/* maximum key range = 73, duplicates = 0 */ + + + +static inline unsigned int +hash (register const char *str, register size_t len) +{ + static const unsigned char asso_values[] = + { + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 5, 78, 39, 14, 1, + 31, 31, 13, 13, 78, 78, 22, 25, 10, 2, + 7, 78, 22, 0, 1, 3, 1, 78, 0, 36, + 14, 17, 20, 78, 78, 78, 78, 5, 78, 39, + 14, 1, 31, 31, 13, 13, 78, 78, 22, 25, + 10, 2, 7, 78, 22, 0, 1, 3, 1, 78, + 0, 36, 14, 17, 20, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, + 
78, 78, 78, 78, 78, 78, 78, 78 + }; + register unsigned int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[9]]; + /*FALLTHROUGH*/ + case 9: + case 8: + case 7: + case 6: + case 5: + case 4: + case 3: + case 2: + case 1: + hval += asso_values[(unsigned char)str[0]+2]; + break; + } + return hval + asso_values[(unsigned char)str[len - 1]]; +} + +const StringReplacement * +gumbo_get_svg_attr_replacement (register const char *str, register size_t len) +{ + static const unsigned char lengthtable[] = + { + 0, 0, 0, 0, 0, 4, 0, 7, 7, 0, 8, 9, 10, 11, + 11, 11, 11, 10, 16, 18, 16, 12, 16, 11, 13, 11, 12, 11, + 16, 0, 17, 9, 9, 8, 9, 10, 13, 10, 12, 14, 8, 4, + 12, 19, 7, 9, 12, 12, 11, 14, 10, 19, 8, 16, 13, 16, + 16, 15, 10, 12, 0, 0, 13, 13, 13, 0, 0, 9, 16, 0, + 0, 0, 0, 0, 0, 0, 0, 17 + }; + static const StringReplacement wordlist[] = + { + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, + {"refx", "refX"}, + {(char*)0,(char*)0}, + {"viewbox", "viewBox"}, + {"targetx", "targetX"}, + {(char*)0,(char*)0}, + {"calcmode", "calcMode"}, + {"maskunits", "maskUnits"}, + {"viewtarget", "viewTarget"}, + {"tablevalues", "tableValues"}, + {"markerunits", "markerUnits"}, + {"stitchtiles", "stitchTiles"}, + {"startoffset", "startOffset"}, + {"numoctaves", "numOctaves"}, + {"requiredfeatures", "requiredFeatures"}, + {"requiredextensions", "requiredExtensions"}, + {"specularexponent", "specularExponent"}, + {"surfacescale", "surfaceScale"}, + {"specularconstant", "specularConstant"}, + {"repeatcount", "repeatCount"}, + {"clippathunits", "clipPathUnits"}, + {"filterunits", "filterUnits"}, + {"lengthadjust", "lengthAdjust"}, + {"markerwidth", "markerWidth"}, + {"maskcontentunits", "maskContentUnits"}, + {(char*)0,(char*)0}, + {"limitingconeangle", "limitingConeAngle"}, + {"pointsatx", "pointsAtX"}, + {"repeatdur", "repeatDur"}, + {"keytimes", "keyTimes"}, + {"keypoints", "keyPoints"}, + 
{"keysplines", "keySplines"}, + {"gradientunits", "gradientUnits"}, + {"textlength", "textLength"}, + {"stddeviation", "stdDeviation"}, + {"primitiveunits", "primitiveUnits"}, + {"edgemode", "edgeMode"}, + {"refy", "refY"}, + {"spreadmethod", "spreadMethod"}, + {"preserveaspectratio", "preserveAspectRatio"}, + {"targety", "targetY"}, + {"pointsatz", "pointsAtZ"}, + {"markerheight", "markerHeight"}, + {"patternunits", "patternUnits"}, + {"baseprofile", "baseProfile"}, + {"systemlanguage", "systemLanguage"}, + {"zoomandpan", "zoomAndPan"}, + {"patterncontentunits", "patternContentUnits"}, + {"glyphref", "glyphRef"}, + {"xchannelselector", "xChannelSelector"}, + {"attributetype", "attributeType"}, + {"kernelunitlength", "kernelUnitLength"}, + {"ychannelselector", "yChannelSelector"}, + {"diffuseconstant", "diffuseConstant"}, + {"pathlength", "pathLength"}, + {"kernelmatrix", "kernelMatrix"}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {"preservealpha", "preserveAlpha"}, + {"attributename", "attributeName"}, + {"basefrequency", "baseFrequency"}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {"pointsaty", "pointsAtY"}, + {"patterntransform", "patternTransform"}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {"gradienttransform", "gradientTransform"} + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register unsigned int key = hash (str, len); + + if (key <= MAX_HASH_VALUE) + if (len == lengthtable[key]) + { + register const char *s = wordlist[key].from; + + if (s && (((unsigned char)*str ^ (unsigned char)*s) & ~32) == 0 && !gumbo_ascii_strncasecmp(str, s, len)) + return &wordlist[key]; + } + } + return 0; +} diff --git a/gumbo-parser/src/svg_attrs.gperf b/gumbo-parser/src/svg_attrs.gperf new file mode 100644 index 00000000..ce9b2eb2 --- /dev/null +++ b/gumbo-parser/src/svg_attrs.gperf @@ -0,0 +1,77 @@ +%{ +#include 
"replacement.h" +#include "macros.h" +#include "ascii.h" +%} + +%ignore-case +%struct-type +%omit-struct-type +%compare-lengths +%readonly-tables +%null-strings +%includes +%define lookup-function-name gumbo_get_svg_attr_replacement +%define slot-name from +%define initializer-suffix ,(char*)0 +StringReplacement; + +%% +"attributename", "attributeName" +"attributetype", "attributeType" +"basefrequency", "baseFrequency" +"baseprofile", "baseProfile" +"calcmode", "calcMode" +"clippathunits", "clipPathUnits" +"diffuseconstant", "diffuseConstant" +"edgemode", "edgeMode" +"filterunits", "filterUnits" +"glyphref", "glyphRef" +"gradienttransform", "gradientTransform" +"gradientunits", "gradientUnits" +"kernelmatrix", "kernelMatrix" +"kernelunitlength", "kernelUnitLength" +"keypoints", "keyPoints" +"keysplines", "keySplines" +"keytimes", "keyTimes" +"lengthadjust", "lengthAdjust" +"limitingconeangle", "limitingConeAngle" +"markerheight", "markerHeight" +"markerunits", "markerUnits" +"markerwidth", "markerWidth" +"maskcontentunits", "maskContentUnits" +"maskunits", "maskUnits" +"numoctaves", "numOctaves" +"pathlength", "pathLength" +"patterncontentunits", "patternContentUnits" +"patterntransform", "patternTransform" +"patternunits", "patternUnits" +"pointsatx", "pointsAtX" +"pointsaty", "pointsAtY" +"pointsatz", "pointsAtZ" +"preservealpha", "preserveAlpha" +"preserveaspectratio", "preserveAspectRatio" +"primitiveunits", "primitiveUnits" +"refx", "refX" +"refy", "refY" +"repeatcount", "repeatCount" +"repeatdur", "repeatDur" +"requiredextensions", "requiredExtensions" +"requiredfeatures", "requiredFeatures" +"specularconstant", "specularConstant" +"specularexponent", "specularExponent" +"spreadmethod", "spreadMethod" +"startoffset", "startOffset" +"stddeviation", "stdDeviation" +"stitchtiles", "stitchTiles" +"surfacescale", "surfaceScale" +"systemlanguage", "systemLanguage" +"tablevalues", "tableValues" +"targetx", "targetX" +"targety", "targetY" +"textlength", "textLength" 
+"viewbox", "viewBox" +"viewtarget", "viewTarget" +"xchannelselector", "xChannelSelector" +"ychannelselector", "yChannelSelector" +"zoomandpan", "zoomAndPan" diff --git a/gumbo-parser/src/svg_tags.c b/gumbo-parser/src/svg_tags.c new file mode 100644 index 00000000..5d835454 --- /dev/null +++ b/gumbo-parser/src/svg_tags.c @@ -0,0 +1,137 @@ +/* ANSI-C code produced by gperf version 3.1 */ +/* Command-line: gperf -m100 lib/svg_tags.gperf */ +/* Computed positions: -k'3,7' */ +/* Filtered by: mk/gperf-filter.sed */ + +#include "replacement.h" +#include "macros.h" +#include "ascii.h" +#include <string.h> + +#define TOTAL_KEYWORDS 36 +#define MIN_WORD_LENGTH 6 +#define MAX_WORD_LENGTH 19 +#define MIN_HASH_VALUE 6 +#define MAX_HASH_VALUE 42 +/* maximum key range = 37, duplicates = 0 */ + + + +static inline unsigned int +hash (register const char *str, register size_t len) +{ + static const unsigned char asso_values[] = + { + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 12, 2, 10, 22, + 1, 28, 15, 1, 43, 43, 43, 0, 9, 26, + 3, 17, 1, 11, 0, 22, 5, 43, 3, 2, + 43, 43, 43, 43, 43, 43, 43, 43, 12, 2, + 10, 22, 1, 28, 15, 1, 43, 43, 43, 0, + 9, 26, 3, 17, 1, 11, 0, 22, 5, 43, + 3, 2, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43 + }; + 
register unsigned int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[6]+1]; + /*FALLTHROUGH*/ + case 6: + case 5: + case 4: + case 3: + hval += asso_values[(unsigned char)str[2]]; + break; + } + return hval; +} + +const StringReplacement * +gumbo_get_svg_tag_replacement (register const char *str, register size_t len) +{ + static const unsigned char lengthtable[] = + { + 0, 0, 0, 0, 0, 0, 6, 0, 7, 7, 7, 8, 11, 12, + 12, 13, 11, 12, 16, 7, 7, 16, 11, 7, 19, 8, 13, 17, + 11, 12, 7, 8, 17, 8, 18, 8, 14, 12, 14, 14, 13, 7, + 14 + }; + static const StringReplacement wordlist[] = + { + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {(char*)0,(char*)0}, {(char*)0,(char*)0}, + {"fetile", "feTile"}, + {(char*)0,(char*)0}, + {"femerge", "feMerge"}, + {"feimage", "feImage"}, + {"fefuncb", "feFuncB"}, + {"glyphref", "glyphRef"}, + {"femergenode", "feMergeNode"}, + {"femorphology", "feMorphology"}, + {"animatecolor", "animateColor"}, + {"animatemotion", "animateMotion"}, + {"fecomposite", "feComposite"}, + {"feturbulence", "feTurbulence"}, + {"animatetransform", "animateTransform"}, + {"fefuncr", "feFuncR"}, + {"fefunca", "feFuncA"}, + {"feconvolvematrix", "feConvolveMatrix"}, + {"fespotlight", "feSpotLight"}, + {"fefuncg", "feFuncG"}, + {"fecomponenttransfer", "feComponentTransfer"}, + {"altglyph", "altGlyph"}, + {"fecolormatrix", "feColorMatrix"}, + {"fedisplacementmap", "feDisplacementMap"}, + {"altglyphdef", "altGlyphDef"}, + {"altglyphitem", "altGlyphItem"}, + {"feflood", "feFlood"}, + {"clippath", "clipPath"}, + {"fediffuselighting", "feDiffuseLighting"}, + {"textpath", "textPath"}, + {"fespecularlighting", "feSpecularLighting"}, + {"feoffset", "feOffset"}, + {"fedistantlight", "feDistantLight"}, + {"fepointlight", "fePointLight"}, + {"lineargradient", "linearGradient"}, + {"radialgradient", "radialGradient"}, + {"foreignobject", "foreignObject"}, + {"feblend", "feBlend"}, + {"fegaussianblur", 
"feGaussianBlur"} + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register unsigned int key = hash (str, len); + + if (key <= MAX_HASH_VALUE) + if (len == lengthtable[key]) + { + register const char *s = wordlist[key].from; + + if (s && (((unsigned char)*str ^ (unsigned char)*s) & ~32) == 0 && !gumbo_ascii_strncasecmp(str, s, len)) + return &wordlist[key]; + } + } + return 0; +} diff --git a/gumbo-parser/src/svg_tags.gperf b/gumbo-parser/src/svg_tags.gperf new file mode 100644 index 00000000..a3c05f71 --- /dev/null +++ b/gumbo-parser/src/svg_tags.gperf @@ -0,0 +1,55 @@ +%{ +#include "replacement.h" +#include "macros.h" +#include "ascii.h" +%} + +%ignore-case +%struct-type +%omit-struct-type +%compare-lengths +%readonly-tables +%null-strings +%includes +%define lookup-function-name gumbo_get_svg_tag_replacement +%define slot-name from +%define initializer-suffix ,(char*)0 +StringReplacement; + +%% +"altglyph", "altGlyph" +"altglyphdef", "altGlyphDef" +"altglyphitem", "altGlyphItem" +"animatecolor", "animateColor" +"animatemotion", "animateMotion" +"animatetransform", "animateTransform" +"clippath", "clipPath" +"feblend", "feBlend" +"fecolormatrix", "feColorMatrix" +"fecomponenttransfer", "feComponentTransfer" +"fecomposite", "feComposite" +"feconvolvematrix", "feConvolveMatrix" +"fediffuselighting", "feDiffuseLighting" +"fedisplacementmap", "feDisplacementMap" +"fedistantlight", "feDistantLight" +"feflood", "feFlood" +"fefunca", "feFuncA" +"fefuncb", "feFuncB" +"fefuncg", "feFuncG" +"fefuncr", "feFuncR" +"fegaussianblur", "feGaussianBlur" +"feimage", "feImage" +"femerge", "feMerge" +"femergenode", "feMergeNode" +"femorphology", "feMorphology" +"feoffset", "feOffset" +"fepointlight", "fePointLight" +"fespecularlighting", "feSpecularLighting" +"fespotlight", "feSpotLight" +"fetile", "feTile" +"feturbulence", "feTurbulence" +"foreignobject", "foreignObject" +"glyphref", "glyphRef" +"lineargradient", "linearGradient" +"radialgradient", 
"radialGradient" +"textpath", "textPath" diff --git a/gumbo-parser/src/tag.c b/gumbo-parser/src/tag.c index 08cb9238..3cae2d33 100644 --- a/gumbo-parser/src/tag.c +++ b/gumbo-parser/src/tag.c @@ -1,40 +1,187 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2011 Google Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ #include "gumbo.h" +#include "util.h" +#include "tag_lookup.h" #include <assert.h> -#include <ctype.h> #include <string.h> -const char* kGumboTagNames[] = { -#include "tag_strings.h" - "", // TAG_UNKNOWN - "", // TAG_LAST -}; +static const char kGumboTagNames[GUMBO_TAG_LAST+1][15] = { + [GUMBO_TAG_HTML] = "html", + [GUMBO_TAG_HEAD] = "head", + [GUMBO_TAG_TITLE] = "title", + [GUMBO_TAG_BASE] = "base", + [GUMBO_TAG_LINK] = "link", + [GUMBO_TAG_META] = "meta", + [GUMBO_TAG_STYLE] = "style", + [GUMBO_TAG_SCRIPT] = "script", + [GUMBO_TAG_NOSCRIPT] = "noscript", + [GUMBO_TAG_TEMPLATE] = "template", + [GUMBO_TAG_BODY] = "body", + [GUMBO_TAG_ARTICLE] = "article", + [GUMBO_TAG_SECTION] = "section", + [GUMBO_TAG_NAV] = "nav", + [GUMBO_TAG_ASIDE] = "aside", + [GUMBO_TAG_H1] = "h1", + [GUMBO_TAG_H2] = "h2", + [GUMBO_TAG_H3] = "h3", + [GUMBO_TAG_H4] = "h4", + [GUMBO_TAG_H5] = "h5", + [GUMBO_TAG_H6] = "h6", + [GUMBO_TAG_HGROUP] = "hgroup", + [GUMBO_TAG_HEADER] = "header", + [GUMBO_TAG_FOOTER] = "footer", + [GUMBO_TAG_ADDRESS] = "address", + [GUMBO_TAG_P] = "p", + [GUMBO_TAG_HR] = "hr", + [GUMBO_TAG_PRE] = "pre", + [GUMBO_TAG_BLOCKQUOTE] = "blockquote", + [GUMBO_TAG_OL] = "ol", + [GUMBO_TAG_UL] = "ul", + [GUMBO_TAG_LI] = "li", + [GUMBO_TAG_DL] = "dl", + [GUMBO_TAG_DT] = "dt", + [GUMBO_TAG_DD] = "dd", + [GUMBO_TAG_FIGURE] = "figure", + [GUMBO_TAG_FIGCAPTION] = "figcaption", + [GUMBO_TAG_MAIN] = "main", + [GUMBO_TAG_DIV] = "div", + [GUMBO_TAG_A] = "a", + [GUMBO_TAG_EM] = "em", + [GUMBO_TAG_STRONG] = "strong", + [GUMBO_TAG_SMALL] = "small", + [GUMBO_TAG_S] = "s", + [GUMBO_TAG_CITE] = "cite", + [GUMBO_TAG_Q] = "q", + [GUMBO_TAG_DFN] = "dfn", + [GUMBO_TAG_ABBR] = "abbr", + [GUMBO_TAG_DATA] = "data", + [GUMBO_TAG_TIME] = "time", + [GUMBO_TAG_CODE] = "code", + [GUMBO_TAG_VAR] = "var", + [GUMBO_TAG_SAMP] = "samp", + [GUMBO_TAG_KBD] = "kbd", + [GUMBO_TAG_SUB] = "sub", + [GUMBO_TAG_SUP] = "sup", + [GUMBO_TAG_I] = "i", + [GUMBO_TAG_B] = "b", + [GUMBO_TAG_U] = "u", + [GUMBO_TAG_MARK] = 
"mark", + [GUMBO_TAG_RUBY] = "ruby", + [GUMBO_TAG_RT] = "rt", + [GUMBO_TAG_RP] = "rp", + [GUMBO_TAG_BDI] = "bdi", + [GUMBO_TAG_BDO] = "bdo", + [GUMBO_TAG_SPAN] = "span", + [GUMBO_TAG_BR] = "br", + [GUMBO_TAG_WBR] = "wbr", + [GUMBO_TAG_INS] = "ins", + [GUMBO_TAG_DEL] = "del", + [GUMBO_TAG_IMAGE] = "image", + [GUMBO_TAG_IMG] = "img", + [GUMBO_TAG_IFRAME] = "iframe", + [GUMBO_TAG_EMBED] = "embed", + [GUMBO_TAG_OBJECT] = "object", + [GUMBO_TAG_PARAM] = "param", + [GUMBO_TAG_VIDEO] = "video", + [GUMBO_TAG_AUDIO] = "audio", + [GUMBO_TAG_SOURCE] = "source", + [GUMBO_TAG_TRACK] = "track", + [GUMBO_TAG_CANVAS] = "canvas", + [GUMBO_TAG_MAP] = "map", + [GUMBO_TAG_AREA] = "area", + [GUMBO_TAG_MATH] = "math", + [GUMBO_TAG_MI] = "mi", + [GUMBO_TAG_MO] = "mo", + [GUMBO_TAG_MN] = "mn", + [GUMBO_TAG_MS] = "ms", + [GUMBO_TAG_MTEXT] = "mtext", + [GUMBO_TAG_MGLYPH] = "mglyph", + [GUMBO_TAG_MALIGNMARK] = "malignmark", + [GUMBO_TAG_ANNOTATION_XML] = "annotation-xml", + [GUMBO_TAG_SVG] = "svg", + [GUMBO_TAG_FOREIGNOBJECT] = "foreignobject", + [GUMBO_TAG_DESC] = "desc", + [GUMBO_TAG_TABLE] = "table", + [GUMBO_TAG_CAPTION] = "caption", + [GUMBO_TAG_COLGROUP] = "colgroup", + [GUMBO_TAG_COL] = "col", + [GUMBO_TAG_TBODY] = "tbody", + [GUMBO_TAG_THEAD] = "thead", + [GUMBO_TAG_TFOOT] = "tfoot", + [GUMBO_TAG_TR] = "tr", + [GUMBO_TAG_TD] = "td", + [GUMBO_TAG_TH] = "th", + [GUMBO_TAG_FORM] = "form", + [GUMBO_TAG_FIELDSET] = "fieldset", + [GUMBO_TAG_LEGEND] = "legend", + [GUMBO_TAG_LABEL] = "label", + [GUMBO_TAG_INPUT] = "input", + [GUMBO_TAG_BUTTON] = "button", + [GUMBO_TAG_SELECT] = "select", + [GUMBO_TAG_DATALIST] = "datalist", + [GUMBO_TAG_OPTGROUP] = "optgroup", + [GUMBO_TAG_OPTION] = "option", + [GUMBO_TAG_TEXTAREA] = "textarea", + [GUMBO_TAG_KEYGEN] = "keygen", + [GUMBO_TAG_OUTPUT] = "output", + [GUMBO_TAG_PROGRESS] = "progress", + [GUMBO_TAG_METER] = "meter", + [GUMBO_TAG_DETAILS] = "details", + [GUMBO_TAG_SUMMARY] = "summary", + [GUMBO_TAG_MENU] = "menu", + [GUMBO_TAG_MENUITEM] = 
"menuitem", + [GUMBO_TAG_APPLET] = "applet", + [GUMBO_TAG_ACRONYM] = "acronym", + [GUMBO_TAG_BGSOUND] = "bgsound", + [GUMBO_TAG_DIR] = "dir", + [GUMBO_TAG_FRAME] = "frame", + [GUMBO_TAG_FRAMESET] = "frameset", + [GUMBO_TAG_NOFRAMES] = "noframes", + [GUMBO_TAG_LISTING] = "listing", + [GUMBO_TAG_XMP] = "xmp", + [GUMBO_TAG_NEXTID] = "nextid", + [GUMBO_TAG_NOEMBED] = "noembed", + [GUMBO_TAG_PLAINTEXT] = "plaintext", + [GUMBO_TAG_RB] = "rb", + [GUMBO_TAG_STRIKE] = "strike", + [GUMBO_TAG_BASEFONT] = "basefont", + [GUMBO_TAG_BIG] = "big", + [GUMBO_TAG_BLINK] = "blink", + [GUMBO_TAG_CENTER] = "center", + [GUMBO_TAG_FONT] = "font", + [GUMBO_TAG_MARQUEE] = "marquee", + [GUMBO_TAG_MULTICOL] = "multicol", + [GUMBO_TAG_NOBR] = "nobr", + [GUMBO_TAG_SPACER] = "spacer", + [GUMBO_TAG_TT] = "tt", + [GUMBO_TAG_RTC] = "rtc", + [GUMBO_TAG_DIALOG] = "dialog", -static const unsigned char kGumboTagSizes[] = { -#include "tag_sizes.h" - 0, // TAG_UNKNOWN - 0, // TAG_LAST + [GUMBO_TAG_UNKNOWN] = "", + [GUMBO_TAG_LAST] = "", }; const char* gumbo_normalized_tagname(GumboTag tag) { assert(tag <= GUMBO_TAG_LAST); - return kGumboTagNames[tag]; + const char *tagname = kGumboTagNames[tag]; + assert(tagname); + return tagname; } void gumbo_tag_from_original_text(GumboStringPiece* text) { @@ -45,52 +192,31 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) { assert(text->length >= 2); assert(text->data[0] == '<'); assert(text->data[text->length - 1] == '>'); + if (text->data[1] == '/') { - // End tag. + // End tag assert(text->length >= 3); text->data += 2; // Move past </ text->length -= 3; } else { - // Start tag. 
+ // Start tag text->data += 1; // Move past < text->length -= 2; - // strnchr is apparently not a standard C library function, so I loop - // explicitly looking for whitespace or other illegal tag characters - as - // accepted by the Tag Name State for (const char* c = text->data; c != text->data + text->length; ++c) { - if (*c == '\t' || *c == '\n' || *c == '\f' || *c == ' ' || *c == '/') { + switch (*c) { + case '\t': + case '\n': + case '\f': + case ' ': + case '/': text->length = c - text->data; - break; + return; } } } } -static int case_memcmp(const char* s1, const char* s2, unsigned int n) { - while (n--) { - unsigned char c1 = tolower(*s1++); - unsigned char c2 = tolower(*s2++); - if (c1 != c2) return (int) c1 - (int) c2; - } - return 0; -} - -#include "tag_gperf.h" -#define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0])) - -GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) { - if (length) { - unsigned int key = tag_hash(tagname, length); - if (key < TAG_MAP_SIZE) { - GumboTag tag = kGumboTagMap[key]; - if (length == kGumboTagSizes[(int) tag] && - !case_memcmp(tagname, kGumboTagNames[(int) tag], length)) - return tag; - } - } - return GUMBO_TAG_UNKNOWN; -} - -GumboTag gumbo_tag_enum(const char* tagname) { - return gumbo_tagn_enum(tagname, strlen(tagname)); +GumboTag gumbo_tagn_enum(const char *tagname, size_t tagname_length) { + const TagHashSlot *slot = gumbo_tag_lookup(tagname, tagname_length); + return slot ? 
slot->tag : GUMBO_TAG_UNKNOWN; } diff --git a/gumbo-parser/src/tag.in b/gumbo-parser/src/tag.in deleted file mode 100644 index 4c252648..00000000 --- a/gumbo-parser/src/tag.in +++ /dev/null @@ -1,150 +0,0 @@ -html -head -title -base -link -meta -style -script -noscript -template -body -article -section -nav -aside -h1 -h2 -h3 -h4 -h5 -h6 -hgroup -header -footer -address -p -hr -pre -blockquote -ol -ul -li -dl -dt -dd -figure -figcaption -main -div -a -em -strong -small -s -cite -q -dfn -abbr -data -time -code -var -samp -kbd -sub -sup -i -b -u -mark -ruby -rt -rp -bdi -bdo -span -br -wbr -ins -del -image -img -iframe -embed -object -param -video -audio -source -track -canvas -map -area -math -mi -mo -mn -ms -mtext -mglyph -malignmark -annotation-xml -svg -foreignobject -desc -table -caption -colgroup -col -tbody -thead -tfoot -tr -td -th -form -fieldset -legend -label -input -button -select -datalist -optgroup -option -textarea -keygen -output -progress -meter -details -summary -menu -menuitem -applet -acronym -bgsound -dir -frame -frameset -noframes -isindex -listing -xmp -nextid -noembed -plaintext -rb -strike -basefont -big -blink -center -font -marquee -multicol -nobr -spacer -tt -rtc diff --git a/gumbo-parser/src/tag_enum.h b/gumbo-parser/src/tag_enum.h deleted file mode 100644 index 6d7aeb3d..00000000 --- a/gumbo-parser/src/tag_enum.h +++ /dev/null @@ -1,153 +0,0 @@ -// Generated via `gentags.py src/tag.in`. -// Do not edit; edit src/tag.in instead. 
-// clang-format off -GUMBO_TAG_HTML, -GUMBO_TAG_HEAD, -GUMBO_TAG_TITLE, -GUMBO_TAG_BASE, -GUMBO_TAG_LINK, -GUMBO_TAG_META, -GUMBO_TAG_STYLE, -GUMBO_TAG_SCRIPT, -GUMBO_TAG_NOSCRIPT, -GUMBO_TAG_TEMPLATE, -GUMBO_TAG_BODY, -GUMBO_TAG_ARTICLE, -GUMBO_TAG_SECTION, -GUMBO_TAG_NAV, -GUMBO_TAG_ASIDE, -GUMBO_TAG_H1, -GUMBO_TAG_H2, -GUMBO_TAG_H3, -GUMBO_TAG_H4, -GUMBO_TAG_H5, -GUMBO_TAG_H6, -GUMBO_TAG_HGROUP, -GUMBO_TAG_HEADER, -GUMBO_TAG_FOOTER, -GUMBO_TAG_ADDRESS, -GUMBO_TAG_P, -GUMBO_TAG_HR, -GUMBO_TAG_PRE, -GUMBO_TAG_BLOCKQUOTE, -GUMBO_TAG_OL, -GUMBO_TAG_UL, -GUMBO_TAG_LI, -GUMBO_TAG_DL, -GUMBO_TAG_DT, -GUMBO_TAG_DD, -GUMBO_TAG_FIGURE, -GUMBO_TAG_FIGCAPTION, -GUMBO_TAG_MAIN, -GUMBO_TAG_DIV, -GUMBO_TAG_A, -GUMBO_TAG_EM, -GUMBO_TAG_STRONG, -GUMBO_TAG_SMALL, -GUMBO_TAG_S, -GUMBO_TAG_CITE, -GUMBO_TAG_Q, -GUMBO_TAG_DFN, -GUMBO_TAG_ABBR, -GUMBO_TAG_DATA, -GUMBO_TAG_TIME, -GUMBO_TAG_CODE, -GUMBO_TAG_VAR, -GUMBO_TAG_SAMP, -GUMBO_TAG_KBD, -GUMBO_TAG_SUB, -GUMBO_TAG_SUP, -GUMBO_TAG_I, -GUMBO_TAG_B, -GUMBO_TAG_U, -GUMBO_TAG_MARK, -GUMBO_TAG_RUBY, -GUMBO_TAG_RT, -GUMBO_TAG_RP, -GUMBO_TAG_BDI, -GUMBO_TAG_BDO, -GUMBO_TAG_SPAN, -GUMBO_TAG_BR, -GUMBO_TAG_WBR, -GUMBO_TAG_INS, -GUMBO_TAG_DEL, -GUMBO_TAG_IMAGE, -GUMBO_TAG_IMG, -GUMBO_TAG_IFRAME, -GUMBO_TAG_EMBED, -GUMBO_TAG_OBJECT, -GUMBO_TAG_PARAM, -GUMBO_TAG_VIDEO, -GUMBO_TAG_AUDIO, -GUMBO_TAG_SOURCE, -GUMBO_TAG_TRACK, -GUMBO_TAG_CANVAS, -GUMBO_TAG_MAP, -GUMBO_TAG_AREA, -GUMBO_TAG_MATH, -GUMBO_TAG_MI, -GUMBO_TAG_MO, -GUMBO_TAG_MN, -GUMBO_TAG_MS, -GUMBO_TAG_MTEXT, -GUMBO_TAG_MGLYPH, -GUMBO_TAG_MALIGNMARK, -GUMBO_TAG_ANNOTATION_XML, -GUMBO_TAG_SVG, -GUMBO_TAG_FOREIGNOBJECT, -GUMBO_TAG_DESC, -GUMBO_TAG_TABLE, -GUMBO_TAG_CAPTION, -GUMBO_TAG_COLGROUP, -GUMBO_TAG_COL, -GUMBO_TAG_TBODY, -GUMBO_TAG_THEAD, -GUMBO_TAG_TFOOT, -GUMBO_TAG_TR, -GUMBO_TAG_TD, -GUMBO_TAG_TH, -GUMBO_TAG_FORM, -GUMBO_TAG_FIELDSET, -GUMBO_TAG_LEGEND, -GUMBO_TAG_LABEL, -GUMBO_TAG_INPUT, -GUMBO_TAG_BUTTON, -GUMBO_TAG_SELECT, -GUMBO_TAG_DATALIST, -GUMBO_TAG_OPTGROUP, 
-GUMBO_TAG_OPTION, -GUMBO_TAG_TEXTAREA, -GUMBO_TAG_KEYGEN, -GUMBO_TAG_OUTPUT, -GUMBO_TAG_PROGRESS, -GUMBO_TAG_METER, -GUMBO_TAG_DETAILS, -GUMBO_TAG_SUMMARY, -GUMBO_TAG_MENU, -GUMBO_TAG_MENUITEM, -GUMBO_TAG_APPLET, -GUMBO_TAG_ACRONYM, -GUMBO_TAG_BGSOUND, -GUMBO_TAG_DIR, -GUMBO_TAG_FRAME, -GUMBO_TAG_FRAMESET, -GUMBO_TAG_NOFRAMES, -GUMBO_TAG_ISINDEX, -GUMBO_TAG_LISTING, -GUMBO_TAG_XMP, -GUMBO_TAG_NEXTID, -GUMBO_TAG_NOEMBED, -GUMBO_TAG_PLAINTEXT, -GUMBO_TAG_RB, -GUMBO_TAG_STRIKE, -GUMBO_TAG_BASEFONT, -GUMBO_TAG_BIG, -GUMBO_TAG_BLINK, -GUMBO_TAG_CENTER, -GUMBO_TAG_FONT, -GUMBO_TAG_MARQUEE, -GUMBO_TAG_MULTICOL, -GUMBO_TAG_NOBR, -GUMBO_TAG_SPACER, -GUMBO_TAG_TT, -GUMBO_TAG_RTC, diff --git a/gumbo-parser/src/tag_gperf.h b/gumbo-parser/src/tag_gperf.h deleted file mode 100644 index 378eaf95..00000000 --- a/gumbo-parser/src/tag_gperf.h +++ /dev/null @@ -1,105 +0,0 @@ -static unsigned int tag_hash( - register const char *str, register unsigned int len) { - static unsigned short asso_values[] = {296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 6, 4, 3, 1, 1, 0, - 1, 0, 0, 296, 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, - 69, 0, 134, 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, - 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, 69, 0, 134, - 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 
296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, - 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296}; - register unsigned int hval = len; - - switch (hval) { - default: - hval += asso_values[(unsigned char) str[1] + 3]; - /*FALLTHROUGH*/ - case 1: - hval += asso_values[(unsigned char) str[0]]; - break; - } - return hval + asso_values[(unsigned char) str[len - 1]]; -} - -static const unsigned char kGumboTagMap[] = {GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_S, GUMBO_TAG_H6, GUMBO_TAG_H5, GUMBO_TAG_H4, - GUMBO_TAG_H3, GUMBO_TAG_SPACER, GUMBO_TAG_H2, GUMBO_TAG_HEADER, - GUMBO_TAG_H1, GUMBO_TAG_HEAD, GUMBO_TAG_LAST, GUMBO_TAG_DETAILS, - GUMBO_TAG_SELECT, GUMBO_TAG_DIR, GUMBO_TAG_LAST, GUMBO_TAG_DEL, - GUMBO_TAG_LAST, GUMBO_TAG_SOURCE, GUMBO_TAG_LEGEND, GUMBO_TAG_DATALIST, - GUMBO_TAG_METER, GUMBO_TAG_MGLYPH, GUMBO_TAG_LAST, GUMBO_TAG_MATH, - GUMBO_TAG_LABEL, GUMBO_TAG_TABLE, GUMBO_TAG_TEMPLATE, GUMBO_TAG_LAST, - GUMBO_TAG_RP, GUMBO_TAG_TIME, GUMBO_TAG_TITLE, GUMBO_TAG_DATA, - GUMBO_TAG_APPLET, GUMBO_TAG_HGROUP, GUMBO_TAG_SAMP, GUMBO_TAG_TEXTAREA, - GUMBO_TAG_ABBR, GUMBO_TAG_MARQUEE, GUMBO_TAG_LAST, GUMBO_TAG_MENUITEM, - GUMBO_TAG_SMALL, GUMBO_TAG_META, GUMBO_TAG_A, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_EMBED, - GUMBO_TAG_MAP, GUMBO_TAG_LAST, GUMBO_TAG_PARAM, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_NOBR, GUMBO_TAG_P, GUMBO_TAG_SPAN, GUMBO_TAG_EM, - GUMBO_TAG_LAST, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SECTION, GUMBO_TAG_NOEMBED, - GUMBO_TAG_NEXTID, GUMBO_TAG_FOOTER, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_HR, - GUMBO_TAG_LAST, GUMBO_TAG_FONT, GUMBO_TAG_DL, GUMBO_TAG_TR, - GUMBO_TAG_SCRIPT, GUMBO_TAG_MO, GUMBO_TAG_LAST, GUMBO_TAG_DD, - GUMBO_TAG_MAIN, GUMBO_TAG_TD, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_FORM, - GUMBO_TAG_OBJECT, GUMBO_TAG_LAST, GUMBO_TAG_FIELDSET, GUMBO_TAG_LAST, - 
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENU, GUMBO_TAG_TFOOT, GUMBO_TAG_FIGURE, - GUMBO_TAG_RB, GUMBO_TAG_LI, GUMBO_TAG_LISTING, GUMBO_TAG_BASEFONT, - GUMBO_TAG_OPTGROUP, GUMBO_TAG_LAST, GUMBO_TAG_BASE, GUMBO_TAG_ADDRESS, - GUMBO_TAG_MI, GUMBO_TAG_LAST, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_LAST, - GUMBO_TAG_PROGRESS, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_ACRONYM, GUMBO_TAG_ARTICLE, GUMBO_TAG_LAST, GUMBO_TAG_PRE, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_AREA, - GUMBO_TAG_RT, GUMBO_TAG_LAST, GUMBO_TAG_OPTION, GUMBO_TAG_IMAGE, - GUMBO_TAG_DT, GUMBO_TAG_LAST, GUMBO_TAG_TT, GUMBO_TAG_HTML, GUMBO_TAG_WBR, - GUMBO_TAG_OL, GUMBO_TAG_LAST, GUMBO_TAG_STYLE, GUMBO_TAG_STRIKE, - GUMBO_TAG_SUP, GUMBO_TAG_MULTICOL, GUMBO_TAG_U, GUMBO_TAG_DFN, GUMBO_TAG_UL, - GUMBO_TAG_FIGCAPTION, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST, GUMBO_TAG_VAR, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_FRAMESET, GUMBO_TAG_LAST, - GUMBO_TAG_BR, GUMBO_TAG_I, GUMBO_TAG_FRAME, GUMBO_TAG_LAST, GUMBO_TAG_DIV, - GUMBO_TAG_LAST, GUMBO_TAG_TH, GUMBO_TAG_MS, GUMBO_TAG_ANNOTATION_XML, - GUMBO_TAG_B, GUMBO_TAG_TBODY, GUMBO_TAG_THEAD, GUMBO_TAG_BIG, - GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_XMP, GUMBO_TAG_LAST, GUMBO_TAG_KBD, - GUMBO_TAG_LAST, GUMBO_TAG_LINK, GUMBO_TAG_IFRAME, GUMBO_TAG_MARK, - GUMBO_TAG_CENTER, GUMBO_TAG_OUTPUT, GUMBO_TAG_DESC, GUMBO_TAG_CANVAS, - GUMBO_TAG_COL, GUMBO_TAG_MALIGNMARK, GUMBO_TAG_IMG, GUMBO_TAG_ASIDE, - GUMBO_TAG_LAST, GUMBO_TAG_CODE, GUMBO_TAG_LAST, GUMBO_TAG_SUB, GUMBO_TAG_MN, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_INS, GUMBO_TAG_AUDIO, - GUMBO_TAG_STRONG, GUMBO_TAG_CITE, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_INPUT, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_NAV, GUMBO_TAG_LAST, GUMBO_TAG_COLGROUP, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_SVG, GUMBO_TAG_KEYGEN, GUMBO_TAG_VIDEO, - GUMBO_TAG_BDO, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_BODY, 
GUMBO_TAG_LAST, GUMBO_TAG_Q, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_TRACK, - GUMBO_TAG_LAST, GUMBO_TAG_BDI, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_CAPTION, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_RUBY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BUTTON, - GUMBO_TAG_SUMMARY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_RTC, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BLINK, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, - GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_ISINDEX}; diff --git a/gumbo-parser/src/tag_lookup.c b/gumbo-parser/src/tag_lookup.c new file mode 100644 index 00000000..59adf2ab --- /dev/null +++ b/gumbo-parser/src/tag_lookup.c @@ -0,0 +1,382 @@ +/* ANSI-C code produced by gperf version 3.1 */ +/* Command-line: gperf -m100 lib/tag_lookup.gperf */ +/* Computed positions: -k'1-2,$' */ +/* Filtered by: mk/gperf-filter.sed */ + +#include "tag_lookup.h" +#include "macros.h" +#include "ascii.h" 
+#include <string.h> + +#define TOTAL_KEYWORDS 150 +#define MIN_WORD_LENGTH 1 +#define MAX_WORD_LENGTH 14 +#define MIN_HASH_VALUE 9 +#define MAX_HASH_VALUE 271 +/* maximum key range = 263, duplicates = 0 */ + + + +static inline unsigned int +hash (register const char *str, register size_t len) +{ + static const unsigned short asso_values[] = + { + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 9, + 7, 6, 4, 4, 3, 4, 3, 3, 272, 272, + 272, 272, 272, 272, 272, 70, 83, 152, 7, 16, + 61, 98, 5, 76, 102, 126, 12, 19, 54, 54, + 31, 97, 3, 4, 9, 33, 136, 113, 86, 15, + 272, 272, 272, 272, 272, 272, 272, 70, 83, 152, + 7, 16, 61, 98, 5, 76, 102, 126, 12, 19, + 54, 54, 31, 97, 3, 4, 9, 33, 136, 113, + 86, 15, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, 272 + }; + register unsigned int hval = len; + + switch (hval) + { + default: + hval += asso_values[(unsigned char)str[1]+3]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[(unsigned char)str[0]]; + break; + } + return hval + asso_values[(unsigned char)str[len - 1]]; +} + +const TagHashSlot * +gumbo_tag_lookup (register const char *str, register size_t len) +{ + static 
const unsigned char lengthtable[] = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, + 2, 2, 2, 6, 2, 6, 2, 4, 0, 7, 6, 3, 0, 3, + 0, 6, 6, 8, 5, 0, 0, 4, 5, 5, 8, 0, 2, 4, + 5, 2, 0, 5, 4, 2, 0, 7, 0, 8, 5, 0, 0, 0, + 0, 0, 0, 5, 3, 4, 5, 1, 4, 0, 4, 1, 2, 8, + 7, 7, 6, 6, 8, 2, 8, 4, 2, 0, 6, 0, 0, 3, + 4, 6, 13, 4, 4, 6, 8, 0, 8, 4, 0, 6, 0, 8, + 4, 5, 0, 2, 2, 9, 2, 4, 0, 8, 4, 2, 4, 8, + 7, 0, 2, 5, 2, 0, 6, 0, 3, 2, 2, 6, 3, 8, + 7, 2, 5, 7, 0, 2, 6, 2, 4, 3, 0, 10, 5, 6, + 3, 1, 2, 0, 6, 0, 5, 5, 0, 3, 0, 3, 3, 1, + 4, 6, 4, 7, 3, 0, 0, 2, 10, 10, 0, 0, 6, 1, + 4, 6, 3, 0, 2, 5, 6, 4, 3, 4, 0, 7, 3, 0, + 0, 0, 4, 0, 0, 5, 0, 0, 0, 6, 0, 14, 8, 1, + 3, 0, 0, 7, 3, 0, 0, 0, 0, 0, 0, 5, 3, 0, + 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 7, 6, 0, 0, + 0, 0, 0, 5, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 5, 0, 0, 3 + }; + static const TagHashSlot wordlist[] = + { + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"s", GUMBO_TAG_S}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"h6", GUMBO_TAG_H6}, + {"h5", GUMBO_TAG_H5}, + {"h4", GUMBO_TAG_H4}, + {"h3", GUMBO_TAG_H3}, + {"spacer", GUMBO_TAG_SPACER}, + {"h2", GUMBO_TAG_H2}, + {"header", GUMBO_TAG_HEADER}, + {"h1", GUMBO_TAG_H1}, + {"head", GUMBO_TAG_HEAD}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"details", GUMBO_TAG_DETAILS}, + {"select", GUMBO_TAG_SELECT}, + {"dir", GUMBO_TAG_DIR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"del", GUMBO_TAG_DEL}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"source", GUMBO_TAG_SOURCE}, + {"legend", GUMBO_TAG_LEGEND}, + {"datalist", GUMBO_TAG_DATALIST}, + {"meter", GUMBO_TAG_METER}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, 
+ {"math", GUMBO_TAG_MATH}, + {"label", GUMBO_TAG_LABEL}, + {"table", GUMBO_TAG_TABLE}, + {"template", GUMBO_TAG_TEMPLATE}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"rp", GUMBO_TAG_RP}, + {"time", GUMBO_TAG_TIME}, + {"title", GUMBO_TAG_TITLE}, + {"hr", GUMBO_TAG_HR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"tbody", GUMBO_TAG_TBODY}, + {"samp", GUMBO_TAG_SAMP}, + {"tr", GUMBO_TAG_TR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"marquee", GUMBO_TAG_MARQUEE}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"menuitem", GUMBO_TAG_MENUITEM}, + {"small", GUMBO_TAG_SMALL}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"embed", GUMBO_TAG_EMBED}, + {"map", GUMBO_TAG_MAP}, + {"menu", GUMBO_TAG_MENU}, + {"param", GUMBO_TAG_PARAM}, + {"p", GUMBO_TAG_P}, + {"nobr", GUMBO_TAG_NOBR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"span", GUMBO_TAG_SPAN}, + {"u", GUMBO_TAG_U}, + {"em", GUMBO_TAG_EM}, + {"noframes", GUMBO_TAG_NOFRAMES}, + {"section", GUMBO_TAG_SECTION}, + {"noembed", GUMBO_TAG_NOEMBED}, + {"nextid", GUMBO_TAG_NEXTID}, + {"footer", GUMBO_TAG_FOOTER}, + {"noscript", GUMBO_TAG_NOSCRIPT}, + {"dl", GUMBO_TAG_DL}, + {"progress", GUMBO_TAG_PROGRESS}, + {"font", GUMBO_TAG_FONT}, + {"mo", GUMBO_TAG_MO}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"script", GUMBO_TAG_SCRIPT}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"pre", GUMBO_TAG_PRE}, + {"main", GUMBO_TAG_MAIN}, + {"object", GUMBO_TAG_OBJECT}, + {"foreignobject", GUMBO_TAG_FOREIGNOBJECT}, + {"form", GUMBO_TAG_FORM}, + {"data", GUMBO_TAG_DATA}, + {"applet", GUMBO_TAG_APPLET}, + {"fieldset", GUMBO_TAG_FIELDSET}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"textarea", GUMBO_TAG_TEXTAREA}, + {"abbr", GUMBO_TAG_ABBR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"figure", GUMBO_TAG_FIGURE}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"optgroup", GUMBO_TAG_OPTGROUP}, + {"meta", GUMBO_TAG_META}, + {"tfoot", GUMBO_TAG_TFOOT}, + 
{(char*)0,GUMBO_TAG_UNKNOWN}, + {"ul", GUMBO_TAG_UL}, + {"li", GUMBO_TAG_LI}, + {"plaintext", GUMBO_TAG_PLAINTEXT}, + {"rb", GUMBO_TAG_RB}, + {"body", GUMBO_TAG_BODY}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"basefont", GUMBO_TAG_BASEFONT}, + {"ruby", GUMBO_TAG_RUBY}, + {"mi", GUMBO_TAG_MI}, + {"base", GUMBO_TAG_BASE}, + {"frameset", GUMBO_TAG_FRAMESET}, + {"summary", GUMBO_TAG_SUMMARY}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"dd", GUMBO_TAG_DD}, + {"frame", GUMBO_TAG_FRAME}, + {"td", GUMBO_TAG_TD}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"option", GUMBO_TAG_OPTION}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"svg", GUMBO_TAG_SVG}, + {"br", GUMBO_TAG_BR}, + {"ol", GUMBO_TAG_OL}, + {"dialog", GUMBO_TAG_DIALOG}, + {"sup", GUMBO_TAG_SUP}, + {"multicol", GUMBO_TAG_MULTICOL}, + {"article", GUMBO_TAG_ARTICLE}, + {"rt", GUMBO_TAG_RT}, + {"image", GUMBO_TAG_IMAGE}, + {"listing", GUMBO_TAG_LISTING}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"dt", GUMBO_TAG_DT}, + {"mglyph", GUMBO_TAG_MGLYPH}, + {"tt", GUMBO_TAG_TT}, + {"html", GUMBO_TAG_HTML}, + {"wbr", GUMBO_TAG_WBR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"figcaption", GUMBO_TAG_FIGCAPTION}, + {"style", GUMBO_TAG_STYLE}, + {"strike", GUMBO_TAG_STRIKE}, + {"dfn", GUMBO_TAG_DFN}, + {"a", GUMBO_TAG_A}, + {"th", GUMBO_TAG_TH}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"hgroup", GUMBO_TAG_HGROUP}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"mtext", GUMBO_TAG_MTEXT}, + {"thead", GUMBO_TAG_THEAD}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"var", GUMBO_TAG_VAR}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"xmp", GUMBO_TAG_XMP}, + {"kbd", GUMBO_TAG_KBD}, + {"i", GUMBO_TAG_I}, + {"link", GUMBO_TAG_LINK}, + {"output", GUMBO_TAG_OUTPUT}, + {"mark", GUMBO_TAG_MARK}, + {"acronym", GUMBO_TAG_ACRONYM}, + {"div", GUMBO_TAG_DIV}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"ms", GUMBO_TAG_MS}, + {"malignmark", GUMBO_TAG_MALIGNMARK}, + {"blockquote", GUMBO_TAG_BLOCKQUOTE}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"center", GUMBO_TAG_CENTER}, + {"b", 
GUMBO_TAG_B}, + {"desc", GUMBO_TAG_DESC}, + {"canvas", GUMBO_TAG_CANVAS}, + {"col", GUMBO_TAG_COL}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"mn", GUMBO_TAG_MN}, + {"track", GUMBO_TAG_TRACK}, + {"iframe", GUMBO_TAG_IFRAME}, + {"code", GUMBO_TAG_CODE}, + {"sub", GUMBO_TAG_SUB}, + {"area", GUMBO_TAG_AREA}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"address", GUMBO_TAG_ADDRESS}, + {"ins", GUMBO_TAG_INS}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"cite", GUMBO_TAG_CITE}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"input", GUMBO_TAG_INPUT}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"keygen", GUMBO_TAG_KEYGEN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"annotation-xml", GUMBO_TAG_ANNOTATION_XML}, + {"colgroup", GUMBO_TAG_COLGROUP}, + {"q", GUMBO_TAG_Q}, + {"big", GUMBO_TAG_BIG}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"bgsound", GUMBO_TAG_BGSOUND}, + {"nav", GUMBO_TAG_NAV}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"video", GUMBO_TAG_VIDEO}, + {"img", GUMBO_TAG_IMG}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"audio", GUMBO_TAG_AUDIO}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"caption", GUMBO_TAG_CAPTION}, + {"strong", GUMBO_TAG_STRONG}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"aside", GUMBO_TAG_ASIDE}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"button", GUMBO_TAG_BUTTON}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + 
{(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"bdo", GUMBO_TAG_BDO}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"bdi", GUMBO_TAG_BDI}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"blink", GUMBO_TAG_BLINK}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {(char*)0,GUMBO_TAG_UNKNOWN}, + {"rtc", GUMBO_TAG_RTC} + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register unsigned int key = hash (str, len); + + if (key <= MAX_HASH_VALUE) + if (len == lengthtable[key]) + { + register const char *s = wordlist[key].key; + + if (s && (((unsigned char)*str ^ (unsigned char)*s) & ~32) == 0 && !gumbo_ascii_strncasecmp(str, s, len)) + return &wordlist[key]; + } + } + return 0; +} diff --git a/gumbo-parser/src/tag_lookup.gperf b/gumbo-parser/src/tag_lookup.gperf new file mode 100644 index 00000000..0f7deaaa --- /dev/null +++ b/gumbo-parser/src/tag_lookup.gperf @@ -0,0 +1,169 @@ +%{ +#include "tag_lookup.h" +#include "macros.h" +#include "ascii.h" +%} + +%ignore-case +%struct-type +%omit-struct-type 
+%compare-lengths +%readonly-tables +%null-strings +%includes +%define lookup-function-name gumbo_tag_lookup +%define slot-name key +%define initializer-suffix ,GUMBO_TAG_UNKNOWN +TagHashSlot; + +%% +html, GUMBO_TAG_HTML +head, GUMBO_TAG_HEAD +title, GUMBO_TAG_TITLE +base, GUMBO_TAG_BASE +link, GUMBO_TAG_LINK +meta, GUMBO_TAG_META +style, GUMBO_TAG_STYLE +script, GUMBO_TAG_SCRIPT +noscript, GUMBO_TAG_NOSCRIPT +template, GUMBO_TAG_TEMPLATE +body, GUMBO_TAG_BODY +article, GUMBO_TAG_ARTICLE +section, GUMBO_TAG_SECTION +nav, GUMBO_TAG_NAV +aside, GUMBO_TAG_ASIDE +h1, GUMBO_TAG_H1 +h2, GUMBO_TAG_H2 +h3, GUMBO_TAG_H3 +h4, GUMBO_TAG_H4 +h5, GUMBO_TAG_H5 +h6, GUMBO_TAG_H6 +hgroup, GUMBO_TAG_HGROUP +header, GUMBO_TAG_HEADER +footer, GUMBO_TAG_FOOTER +address, GUMBO_TAG_ADDRESS +p, GUMBO_TAG_P +hr, GUMBO_TAG_HR +pre, GUMBO_TAG_PRE +blockquote, GUMBO_TAG_BLOCKQUOTE +ol, GUMBO_TAG_OL +ul, GUMBO_TAG_UL +li, GUMBO_TAG_LI +dl, GUMBO_TAG_DL +dt, GUMBO_TAG_DT +dd, GUMBO_TAG_DD +figure, GUMBO_TAG_FIGURE +figcaption, GUMBO_TAG_FIGCAPTION +main, GUMBO_TAG_MAIN +div, GUMBO_TAG_DIV +a, GUMBO_TAG_A +em, GUMBO_TAG_EM +strong, GUMBO_TAG_STRONG +small, GUMBO_TAG_SMALL +s, GUMBO_TAG_S +cite, GUMBO_TAG_CITE +q, GUMBO_TAG_Q +dfn, GUMBO_TAG_DFN +abbr, GUMBO_TAG_ABBR +data, GUMBO_TAG_DATA +time, GUMBO_TAG_TIME +code, GUMBO_TAG_CODE +var, GUMBO_TAG_VAR +samp, GUMBO_TAG_SAMP +kbd, GUMBO_TAG_KBD +sub, GUMBO_TAG_SUB +sup, GUMBO_TAG_SUP +i, GUMBO_TAG_I +b, GUMBO_TAG_B +u, GUMBO_TAG_U +mark, GUMBO_TAG_MARK +ruby, GUMBO_TAG_RUBY +rt, GUMBO_TAG_RT +rp, GUMBO_TAG_RP +bdi, GUMBO_TAG_BDI +bdo, GUMBO_TAG_BDO +span, GUMBO_TAG_SPAN +br, GUMBO_TAG_BR +wbr, GUMBO_TAG_WBR +ins, GUMBO_TAG_INS +del, GUMBO_TAG_DEL +image, GUMBO_TAG_IMAGE +img, GUMBO_TAG_IMG +iframe, GUMBO_TAG_IFRAME +embed, GUMBO_TAG_EMBED +object, GUMBO_TAG_OBJECT +param, GUMBO_TAG_PARAM +video, GUMBO_TAG_VIDEO +audio, GUMBO_TAG_AUDIO +source, GUMBO_TAG_SOURCE +track, GUMBO_TAG_TRACK +canvas, GUMBO_TAG_CANVAS +map, GUMBO_TAG_MAP +area, 
GUMBO_TAG_AREA +math, GUMBO_TAG_MATH +mi, GUMBO_TAG_MI +mo, GUMBO_TAG_MO +mn, GUMBO_TAG_MN +ms, GUMBO_TAG_MS +mtext, GUMBO_TAG_MTEXT +mglyph, GUMBO_TAG_MGLYPH +malignmark, GUMBO_TAG_MALIGNMARK +annotation-xml, GUMBO_TAG_ANNOTATION_XML +svg, GUMBO_TAG_SVG +foreignobject, GUMBO_TAG_FOREIGNOBJECT +desc, GUMBO_TAG_DESC +table, GUMBO_TAG_TABLE +caption, GUMBO_TAG_CAPTION +colgroup, GUMBO_TAG_COLGROUP +col, GUMBO_TAG_COL +tbody, GUMBO_TAG_TBODY +thead, GUMBO_TAG_THEAD +tfoot, GUMBO_TAG_TFOOT +tr, GUMBO_TAG_TR +td, GUMBO_TAG_TD +th, GUMBO_TAG_TH +form, GUMBO_TAG_FORM +fieldset, GUMBO_TAG_FIELDSET +legend, GUMBO_TAG_LEGEND +label, GUMBO_TAG_LABEL +input, GUMBO_TAG_INPUT +button, GUMBO_TAG_BUTTON +select, GUMBO_TAG_SELECT +datalist, GUMBO_TAG_DATALIST +optgroup, GUMBO_TAG_OPTGROUP +option, GUMBO_TAG_OPTION +textarea, GUMBO_TAG_TEXTAREA +keygen, GUMBO_TAG_KEYGEN +output, GUMBO_TAG_OUTPUT +progress, GUMBO_TAG_PROGRESS +meter, GUMBO_TAG_METER +details, GUMBO_TAG_DETAILS +summary, GUMBO_TAG_SUMMARY +menu, GUMBO_TAG_MENU +menuitem, GUMBO_TAG_MENUITEM +applet, GUMBO_TAG_APPLET +acronym, GUMBO_TAG_ACRONYM +bgsound, GUMBO_TAG_BGSOUND +dir, GUMBO_TAG_DIR +frame, GUMBO_TAG_FRAME +frameset, GUMBO_TAG_FRAMESET +noframes, GUMBO_TAG_NOFRAMES +listing, GUMBO_TAG_LISTING +xmp, GUMBO_TAG_XMP +nextid, GUMBO_TAG_NEXTID +noembed, GUMBO_TAG_NOEMBED +plaintext, GUMBO_TAG_PLAINTEXT +rb, GUMBO_TAG_RB +strike, GUMBO_TAG_STRIKE +basefont, GUMBO_TAG_BASEFONT +big, GUMBO_TAG_BIG +blink, GUMBO_TAG_BLINK +center, GUMBO_TAG_CENTER +font, GUMBO_TAG_FONT +marquee, GUMBO_TAG_MARQUEE +multicol, GUMBO_TAG_MULTICOL +nobr, GUMBO_TAG_NOBR +spacer, GUMBO_TAG_SPACER +tt, GUMBO_TAG_TT +rtc, GUMBO_TAG_RTC +dialog, GUMBO_TAG_DIALOG diff --git a/gumbo-parser/src/tag_lookup.h b/gumbo-parser/src/tag_lookup.h new file mode 100644 index 00000000..fe8454aa --- /dev/null +++ b/gumbo-parser/src/tag_lookup.h @@ -0,0 +1,13 @@ +#ifndef GUMBO_TAG_LOOKUP_H_ +#define GUMBO_TAG_LOOKUP_H_ + +#include "gumbo.h" + +typedef struct { + 
const char *key; + const GumboTag tag; +} TagHashSlot; + +const TagHashSlot *gumbo_tag_lookup(const char *str, size_t len); + +#endif // GUMBO_TAG_LOOKUP_H_ diff --git a/gumbo-parser/src/tag_sizes.h b/gumbo-parser/src/tag_sizes.h deleted file mode 100644 index 7c92de07..00000000 --- a/gumbo-parser/src/tag_sizes.h +++ /dev/null @@ -1,4 +0,0 @@ -// Generated via `gentags.py src/tag.in`. -// Do not edit; edit src/tag.in instead. -// clang-format off -4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3, \ No newline at end of file diff --git a/gumbo-parser/src/tag_strings.h b/gumbo-parser/src/tag_strings.h deleted file mode 100644 index 6540e2e6..00000000 --- a/gumbo-parser/src/tag_strings.h +++ /dev/null @@ -1,153 +0,0 @@ -// Generated via `gentags.py src/tag.in`. -// Do not edit; edit src/tag.in instead. 
-// clang-format off -"html", -"head", -"title", -"base", -"link", -"meta", -"style", -"script", -"noscript", -"template", -"body", -"article", -"section", -"nav", -"aside", -"h1", -"h2", -"h3", -"h4", -"h5", -"h6", -"hgroup", -"header", -"footer", -"address", -"p", -"hr", -"pre", -"blockquote", -"ol", -"ul", -"li", -"dl", -"dt", -"dd", -"figure", -"figcaption", -"main", -"div", -"a", -"em", -"strong", -"small", -"s", -"cite", -"q", -"dfn", -"abbr", -"data", -"time", -"code", -"var", -"samp", -"kbd", -"sub", -"sup", -"i", -"b", -"u", -"mark", -"ruby", -"rt", -"rp", -"bdi", -"bdo", -"span", -"br", -"wbr", -"ins", -"del", -"image", -"img", -"iframe", -"embed", -"object", -"param", -"video", -"audio", -"source", -"track", -"canvas", -"map", -"area", -"math", -"mi", -"mo", -"mn", -"ms", -"mtext", -"mglyph", -"malignmark", -"annotation-xml", -"svg", -"foreignobject", -"desc", -"table", -"caption", -"colgroup", -"col", -"tbody", -"thead", -"tfoot", -"tr", -"td", -"th", -"form", -"fieldset", -"legend", -"label", -"input", -"button", -"select", -"datalist", -"optgroup", -"option", -"textarea", -"keygen", -"output", -"progress", -"meter", -"details", -"summary", -"menu", -"menuitem", -"applet", -"acronym", -"bgsound", -"dir", -"frame", -"frameset", -"noframes", -"isindex", -"listing", -"xmp", -"nextid", -"noembed", -"plaintext", -"rb", -"strike", -"basefont", -"big", -"blink", -"center", -"font", -"marquee", -"multicol", -"nobr", -"spacer", -"tt", -"rtc", diff --git a/gumbo-parser/src/token_type.h b/gumbo-parser/src/token_type.h index eeab5078..fdee13bb 100644 --- a/gumbo-parser/src/token_type.h +++ b/gumbo-parser/src/token_type.h @@ -1,26 +1,6 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) - #ifndef GUMBO_TOKEN_TYPE_H_ #define GUMBO_TOKEN_TYPE_H_ -#ifdef __cplusplus -extern "C" { -#endif - // An enum representing the type of token. typedef enum { GUMBO_TOKEN_DOCTYPE, @@ -34,8 +14,4 @@ typedef enum { GUMBO_TOKEN_EOF } GumboTokenType; -#ifdef __cplusplus -} // extern C -#endif - -#endif // GUMBO_TOKEN_TYPE_H_ +#endif // GUMBO_TOKEN_TYPE_H_ diff --git a/gumbo-parser/src/tokenizer.c b/gumbo-parser/src/tokenizer.c index 307589f9..26bade14 100644 --- a/gumbo-parser/src/tokenizer.c +++ b/gumbo-parser/src/tokenizer.c @@ -1,69 +1,68 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// Coding conventions specific to this file: -// -// 1. Functions that fill in a token should be named emit_*, and should be -// followed immediately by a return from the tokenizer (true if no error -// occurred, false if an error occurred). 
Sometimes the emit functions -// themselves return a boolean so that they can be combined with the return -// statement; in this case, they should match this convention. -// 2. Functions that shuffle data from temporaries to final API structures -// should be named finish_*, and be called just before the tokenizer exits the -// state that accumulates the temporary. -// 3. All internal data structures should be kept in an initialized state from -// tokenizer creation onwards, ready to accept input. When a buffer's flushed -// and reset, it should be deallocated and immediately reinitialized. -// 4. Make sure there are appropriate break statements following each state. -// 5. Assertions on the state of the temporary and tag buffers are usually a -// good idea, and should go at the entry point of each state when added. -// 6. Statement order within states goes: -// 1. Add parse errors, if appropriate. -// 2. Call finish_* functions to build up tag state. -// 2. Switch to new state. Set _reconsume flag if appropriate. -// 3. Perform any other temporary buffer manipulation. -// 4. Emit tokens -// 5. Return/break. -// This order ensures that we can verify that every emit is followed by a -// return, ensures that the correct state is recorded with any parse errors, and -// prevents parse error position from being messed up by possible mark/resets in -// temporary buffer manipulation. - -#include "tokenizer.h" +/* + Copyright 2010 Google Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* + Coding conventions specific to this file: + + 1. Functions that fill in a token should be named emit_*, and should be + followed immediately by a return from the tokenizer (true if no error + occurred, false if an error occurred). Sometimes the emit functions + themselves return a boolean so that they can be combined with the return + statement; in this case, they should match this convention. + 2. Functions that shuffle data from temporaries to final API structures + should be named finish_*, and be called just before the tokenizer exits the + state that accumulates the temporary. + 3. All internal data structures should be kept in an initialized state from + tokenizer creation onwards, ready to accept input. When a buffer's flushed + and reset, it should be deallocated and immediately reinitialized. + 4. Make sure there are appropriate break statements following each state. + 5. Assertions on the state of the temporary and tag buffers are usually a + good idea, and should go at the entry point of each state when added. + 6. Statement order within states goes: + 1. Add parse errors, if appropriate. + 2. Call finish_* functions to build up tag state. + 2. Switch to new state. Set _reconsume flag if appropriate. + 3. Perform any other temporary buffer manipulation. + 4. Emit tokens + 5. Return/break. + This order ensures that we can verify that every emit is followed by + a return, ensures that the correct state is recorded with any parse + errors, and prevents parse error position from being messed up by + possible mark/resets in temporary buffer manipulation. 
+*/ #include <assert.h> -#include <stdbool.h> #include <string.h> - +#include "tokenizer.h" +#include "ascii.h" #include "attribute.h" #include "char_ref.h" #include "error.h" #include "gumbo.h" #include "parser.h" #include "string_buffer.h" -#include "string_piece.h" #include "token_type.h" #include "tokenizer_states.h" #include "utf8.h" #include "util.h" #include "vector.h" -// Compared against _script_data_buffer to determine if we're in double-escaped -// script mode. -const GumboStringPiece kScriptTag = {"script", 6}; +// Compared against _script_data_buffer to determine if we're in +// double-escaped script mode. +static const GumboStringPiece kScriptTag = {.data = "script", .length = 6}; // An enum for the return value of each individual state. typedef enum { @@ -86,31 +85,35 @@ typedef struct GumboInternalTagState { // the buffer can be re-used for building up attributes. GumboTag _tag; + // The current tag name. It's set at the same time that _tag is set if _tag + // is set to GUMBO_TAG_UNKNOWN. + char *_name; + // The starting location of the text in the buffer. GumboSourcePosition _start_pos; - // The current list of attributes. This is copied (and ownership of its data - // transferred) to the GumboStartTag token upon completion of the tag. New + // The current list of attributes. This is copied (and ownership of its data + // transferred) to the GumboStartTag token upon completion of the tag. New // attributes are added as soon as their attribute name state is complete, and // values are filled in by operating on _attributes.data[attributes.length-1]. GumboVector /* GumboAttribute */ _attributes; - // If true, the next attribute value to be finished should be dropped. This + // If true, the next attribute value to be finished should be dropped. This // happens if a duplicate attribute name is encountered - we want to consume // the attribute value, but shouldn't overwrite the existing value. 
bool _drop_next_attr_value; // The state that caused the tokenizer to switch into a character reference in - // attribute value state. This is used to set the additional allowed - // character, and is switched back to on completion. Initialized as the + // attribute value state. This is used to set the additional allowed + // character, and is switched back to on completion. Initialized as the // tokenizer enters the character reference state. GumboTokenizerEnum _attr_value_state; - // The last start tag to have been emitted by the tokenizer. This is + // The last start tag to have been emitted by the tokenizer. This is // necessary to check for appropriate end tags. GumboTag _last_start_tag; - // If true, then this is a start tag. If false, it's an end tag. This is + // If true, then this is a start tag. If false, it's an end tag. This is // necessary to generate the appropriate token type at tag-closing time. bool _is_start_tag; @@ -121,43 +124,43 @@ typedef struct GumboInternalTagState { // This is the main tokenizer state struct, containing all state used by in // tokenizing the input stream. typedef struct GumboInternalTokenizerState { - // The current lexer state. Starts in GUMBO_LEX_DATA. + // The current lexer state. Starts in GUMBO_LEX_DATA. GumboTokenizerEnum _state; // A flag indicating whether the current input character needs to reconsumed // in another state, or whether the next input character should be read for - // the next iteration of the state loop. This is set when the spec reads + // the next iteration of the state loop. This is set when the spec reads // "Reconsume the current input character in..." bool _reconsume_current_input; - // A flag indicating whether the current node is a foreign element. This is + // A flag indicating whether the current node is a foreign element. This is // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the // markup declaration state. 
bool _is_current_node_foreign; - // A flag indicating whether the tokenizer is in a CDATA section. If so, then + // A flag indicating whether the tokenizer is in a CDATA section. If so, then // text tokens emitted will be GUMBO_TOKEN_CDATA. bool _is_in_cdata; // Certain states (notably character references) may emit two character tokens - // at once, but the contract for lex() fills in only one token at a time. The + // at once, but the contract for lex() fills in only one token at a time. The // extra character is buffered here, and then this is checked on entry to - // lex(). If a character is stored here, it's immediately emitted and control - // returns from the lexer. kGumboNoChar is used to represent 'no character + // lex(). If a character is stored here, it's immediately emitted and control + // returns from the lexer. kGumboNoChar is used to represent 'no character // stored.' // // Note that characters emitted through this mechanism will have their source // position marked as the character under the mark, i.e. multiple characters - // may be emitted with the same position. This is desirable for character - // references, but unsuitable for many other cases. Use the _temporary_buffer + // may be emitted with the same position. This is desirable for character + // references, but unsuitable for many other cases. Use the _temporary_buffer // mechanism if the buffered characters must have their original positions in // the document. int _buffered_emit_char; // A temporary buffer to accumulate characters, as described by the "temporary - // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox + // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox // way: we record the specific character to go into the buffer, which may - // sometimes be a lowercased version of the actual input character. However, + // sometimes be a lowercased version of the actual input character. 
However, // we *also* use utf8iterator_mark() to record the position at tag start. // When we start flushing the temporary buffer, we set _temporary_buffer_emit // to the start of it, and then increment it for each call to the tokenizer. @@ -167,13 +170,13 @@ typedef struct GumboInternalTokenizerState { GumboStringBuffer _temporary_buffer; // The current cursor position we're emitting from within - // _temporary_buffer.data. NULL whenever we're not flushing the buffer. + // _temporary_buffer.data. NULL whenever we're not flushing the buffer. const char* _temporary_buffer_emit; // The temporary buffer is also used by the spec to check whether we should // enter the script data double escaped state, but we can't use the same // buffer for both because we have to flush out "<s" as emits while still - // maintaining the context that will eventually become "script". This is a + // maintaining the context that will eventually become "script". This is a // separate buffer that's used in place of the temporary buffer for states // that may enter the script data double escape start state. GumboStringBuffer _script_data_buffer; @@ -189,7 +192,7 @@ typedef struct GumboInternalTokenizerState { // Current tag state. GumboTagState _tag_state; - // Doctype state. We use the temporary buffer to accumulate characters (it's + // Doctype state. We use the temporary buffer to accumulate characters (it's // not used for anything else in the doctype states), and then freshly // allocate the strings in the doctype token, then copy it over on emit. GumboTokenDocType _doc_type_state; @@ -199,8 +202,10 @@ typedef struct GumboInternalTokenizerState { } GumboTokenizerState; // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct. 
-static void tokenizer_add_parse_error( - GumboParser* parser, GumboErrorType type) { +static void tokenizer_add_parse_error ( + GumboParser* parser, + GumboErrorType type +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -309,14 +314,14 @@ static void tokenizer_add_parse_error( } static bool is_alpha(int c) { - // We don't use ISO C isupper/islower functions here because they - // depend upon the program's locale, while the behavior of the HTML5 spec is - // independent of which locale the program is run in. - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); + // We don't use the ISO C isalpha() function here because it depends + // on the current locale, whereas the behavior in the HTML5 spec is + // locale-independent. + return ((unsigned) c | 32) - 'a' < 26; } static int ensure_lowercase(int c) { - return c >= 'A' && c <= 'Z' ? c + 0x20 : c; + return gumbo_ascii_tolower(c); } static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { @@ -346,7 +351,7 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { // text that will eventually be emitted, it needs to be called a couple of // states before the spec says "Set the temporary buffer to the empty string". // In general, this should be called whenever there's a transition to a -// "less-than sign state". The initial < and possibly / then need to be +// "less-than sign state". 
The initial < and possibly / then need to be // appended to the temporary buffer, their presence needs to be accounted for in // states that compare the temporary buffer against a literal value, and // spec stanzas that say "emit a < and / character token along with a character @@ -356,30 +361,40 @@ static void clear_temporary_buffer(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; assert(!tokenizer->_temporary_buffer_emit); utf8iterator_mark(&tokenizer->_input); - gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer); + gumbo_string_buffer_clear(&tokenizer->_temporary_buffer); // The temporary buffer and script data buffer are the same object in the // spec, so the script data buffer should be cleared as well. - gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_clear(&tokenizer->_script_data_buffer); } // Appends a codepoint to the temporary buffer. -static void append_char_to_temporary_buffer( - GumboParser* parser, int codepoint) { - gumbo_string_buffer_append_codepoint( - parser, codepoint, &parser->_tokenizer_state->_temporary_buffer); +static void append_char_to_temporary_buffer ( + GumboParser* parser, + int codepoint +) { + gumbo_string_buffer_append_codepoint ( + codepoint, + &parser->_tokenizer_state->_temporary_buffer + ); } -// Checks to see if the temporary buffer equals a certain string. -// Make sure this remains side-effect free; it's used in assertions. #ifndef NDEBUG -static bool temporary_buffer_equals(GumboParser* parser, const char* text) { - GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer; - // TODO(jdtang): See if the extra strlen is a performance problem, and replace - // it with an explicit sizeof(literal) if necessary. I don't think it will - // be, as this is only used in a couple of rare states. 
- int text_len = strlen(text); - return text_len == buffer->length && - memcmp(buffer->data, text, text_len) == 0; +static bool temporary_buffer_equals__ ( + const GumboParser* parser, + const char* text, + size_t text_len +) { + const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer; + return + text_len == buf->length + && memcmp(buf->data, text, text_len) == 0; +} + +#define temporary_buffer_equals(parser, text) \ + temporary_buffer_equals__(parser, "" text, sizeof(text) - 1) + +static bool temporary_buffer_is_empty(const GumboParser* parser) { + return parser->_tokenizer_state->_temporary_buffer.length == 0; } #endif @@ -387,9 +402,9 @@ static void doc_type_state_init(GumboParser* parser) { GumboTokenDocType* doc_type_state = &parser->_tokenizer_state->_doc_type_state; // We initialize these to NULL here so that we don't end up leaking memory if - // we never see a doctype token. When we do see a doctype token, we reset + // we never see a doctype token. When we do see a doctype token, we reset // them to a freshly-allocated empty string so that we can present a uniform - // interface to client code and not make them check for null. Ownership is + // interface to client code and not make them check for null. Ownership is // transferred to the doctype token when it's emitted. doc_type_state->name = NULL; doc_type_state->public_identifier = NULL; @@ -408,7 +423,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) { } // Sets the tag buffer original text and start point to the current iterator -// position. This is necessary because attribute names & values may have +// position. This is necessary because attribute names & values may have // whitespace preceeding them, and so we can't assume that the actual token // starting point was the end of the last tag buffer usage. 
static void reset_tag_buffer_start_point(GumboParser* parser) { @@ -423,15 +438,14 @@ static void reset_tag_buffer_start_point(GumboParser* parser) { // and clears the temporary buffer. static void finish_temporary_buffer(GumboParser* parser, const char** output) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; - *output = - gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer); + *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer); clear_temporary_buffer(parser); } // Advances the iterator past the end of the token, and then fills in the -// relevant position fields. It's assumed that after every emit, the tokenizer +// relevant position fields. It's assumed that after every emit, the tokenizer // will immediately return (letting the tree-construction stage read the filled -// in Token). Thus, it's safe to advance the input stream here, since it will +// in Token). Thus, it's safe to advance the input stream here, since it will // bypass the advance at the bottom of the state machine loop. // // Since this advances the iterator and resets the current input, make sure to @@ -450,7 +464,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) { if (token->original_text.length > 0 && token->original_text.data[token->original_text.length - 1] == '\r') { // The UTF8 iterator will ignore carriage returns in the input stream, which - // means that the next token may start one past a \r character. The pointer + // means that the next token may start one past a \r character. The pointer // arithmetic above results in that \r being appended to the original text // of the preceding token, so we have to adjust its length here to chop the // \r off. 
@@ -463,7 +477,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) { static void finish_doctype_public_id(GumboParser* parser) { GumboTokenDocType* doc_type_state = &parser->_tokenizer_state->_doc_type_state; - gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier); + gumbo_free((void*) doc_type_state->public_identifier); finish_temporary_buffer(parser, &doc_type_state->public_identifier); doc_type_state->has_public_identifier = true; } @@ -473,7 +487,7 @@ static void finish_doctype_public_id(GumboParser* parser) { static void finish_doctype_system_id(GumboParser* parser) { GumboTokenDocType* doc_type_state = &parser->_tokenizer_state->_doc_type_state; - gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier); + gumbo_free((void*) doc_type_state->system_identifier); finish_temporary_buffer(parser, &doc_type_state->system_identifier); doc_type_state->has_system_identifier = true; } @@ -495,7 +509,7 @@ static StateResult emit_replacement_char( return RETURN_ERROR; } -// Writes an EOF character token. Always returns RETURN_SUCCESS. +// Writes an EOF character token. Always returns RETURN_SUCCESS. static StateResult emit_eof(GumboParser* parser, GumboToken* output) { emit_char(parser, -1, output); return RETURN_SUCCESS; @@ -520,7 +534,9 @@ static void emit_doctype(GumboParser* parser, GumboToken* output) { // Debug-only function that explicitly sets the attribute vector data to NULL so // it can be asserted on tag creation, verifying that there are no memory leaks. 
static void mark_tag_state_as_empty(GumboTagState* tag_state) { + UNUSED_IF_NDEBUG(tag_state); #ifndef NDEBUG + tag_state->_name = NULL; tag_state->_attributes = kGumboEmptyVector; #endif } @@ -532,6 +548,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) { if (tag_state->_is_start_tag) { output->type = GUMBO_TOKEN_START_TAG; output->v.start_tag.tag = tag_state->_tag; + output->v.start_tag.name = tag_state->_name; output->v.start_tag.attributes = tag_state->_attributes; output->v.start_tag.is_self_closing = tag_state->_is_self_closing; tag_state->_last_start_tag = tag_state->_tag; @@ -540,23 +557,27 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) { "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag)); } else { output->type = GUMBO_TOKEN_END_TAG; - output->v.end_tag = tag_state->_tag; + output->v.end_tag.tag = tag_state->_tag; + output->v.end_tag.is_self_closing = tag_state->_is_self_closing; // In end tags, ownership of the attributes vector is not transferred to the // token, but it's still initialized as normal, so it must be manually - // deallocated. There may also be attributes to destroy, in certain broken + // deallocated. There may also be attributes to destroy, in certain broken // cases like </div</th> (the "th" is an attribute there). 
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) { - gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]); + gumbo_destroy_attribute(tag_state->_attributes.data[i]); } - gumbo_parser_deallocate(parser, tag_state->_attributes.data); + gumbo_free(tag_state->_attributes.data); mark_tag_state_as_empty(tag_state); gumbo_debug( "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag)); } - gumbo_string_buffer_destroy(parser, &tag_state->_buffer); + gumbo_string_buffer_destroy(&tag_state->_buffer); finish_token(parser, output); - gumbo_debug("Original text = %.*s.\n", output->original_text.length, - output->original_text.data); + gumbo_debug ( + "Original text = %.*s.\n", + (int) output->original_text.length, + output->original_text.data + ); assert(output->original_text.length >= 2); assert(output->original_text.data[0] == '<'); assert(output->original_text.data[output->original_text.length - 1] == '>'); @@ -570,26 +591,36 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) { static void abandon_current_tag(GumboParser* parser) { GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) { - gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]); + gumbo_destroy_attribute(tag_state->_attributes.data[i]); } - gumbo_parser_deallocate(parser, tag_state->_attributes.data); + gumbo_free(tag_state->_attributes.data); mark_tag_state_as_empty(tag_state); - gumbo_string_buffer_destroy(parser, &tag_state->_buffer); + gumbo_string_buffer_destroy(&tag_state->_buffer); gumbo_debug("Abandoning current tag.\n"); } -// Wraps the consume_char_ref function to handle its output and make the -// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse +// Wraps the gumbo_consume_char_ref function to handle its output and make the +// appropriate TokenizerState modifications. 
Returns RETURN_ERROR if a parse // error occurred, RETURN_SUCCESS otherwise. -static StateResult emit_char_ref(GumboParser* parser, - int additional_allowed_char, bool is_in_attribute, GumboToken* output) { +static StateResult emit_char_ref ( + GumboParser* parser, + int additional_allowed_char, + bool UNUSED_ARG(is_in_attribute), + GumboToken* output +) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; OneOrTwoCodepoints char_ref; - bool status = consume_char_ref( - parser, &tokenizer->_input, additional_allowed_char, false, &char_ref); + bool status = gumbo_consume_char_ref ( + parser, + &tokenizer->_input, + additional_allowed_char, + false, + &char_ref + ); if (char_ref.first != kGumboNoChar) { - // consume_char_ref ends with the iterator pointing at the next character, - // so we need to be sure not advance it again before reading the next token. + // gumbo_consume_char_ref ends with the iterator pointing at the next + // character, so we need to be sure not advance it again before + // reading the next token. tokenizer->_reconsume_current_input = true; emit_char(parser, char_ref.first, output); tokenizer->_buffered_emit_char = char_ref.second; @@ -599,9 +630,9 @@ static StateResult emit_char_ref(GumboParser* parser, return status ? RETURN_SUCCESS : RETURN_ERROR; } -// Emits a comment token. Comments use the temporary buffer to accumulate their +// Emits a comment token. Comments use the temporary buffer to accumulate their // data, and then it's copied over and released to the 'text' field of the -// GumboToken union. Always returns RETURN_SUCCESS. +// GumboToken union. Always returns RETURN_SUCCESS. static StateResult emit_comment(GumboParser* parser, GumboToken* output) { output->type = GUMBO_TOKEN_COMMENT; finish_temporary_buffer(parser, &output->v.text); @@ -626,11 +657,11 @@ static bool maybe_emit_from_temporary_buffer( } assert(*c == utf8iterator_current(&tokenizer->_input)); - // emit_char also advances the input stream. 
We need to do some juggling of + // emit_char also advances the input stream. We need to do some juggling of // the _reconsume_current_input flag to get the proper behavior when emitting - // previous tokens. Basically, _reconsume_current_input should *never* be set + // previous tokens. Basically, _reconsume_current_input should *never* be set // when emitting anything from the temporary buffer, since those characters - // have already been advanced past. However, it should be preserved so that + // have already been advanced past. However, it should be preserved so that // when the *next* character is encountered again, the tokenizer knows not to // advance past it. bool saved_reconsume_state = tokenizer->_reconsume_current_input; @@ -644,7 +675,7 @@ static bool maybe_emit_from_temporary_buffer( // Sets up the tokenizer to begin flushing the temporary buffer. // This resets the input iterator stream to the start of the last tag, sets up // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits -// the first character in it. It returns true if a character was emitted, false +// the first character in it. It returns true if a character was emitted, false // otherwise. static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; @@ -654,32 +685,35 @@ static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) { return maybe_emit_from_temporary_buffer(parser, output); } -// Appends a codepoint to the current tag buffer. If +// Appends a codepoint to the current tag buffer. If // reinitilize_position_on_first is set, this also initializes the tag buffer // start point; the only time you would *not* want to pass true for this // parameter is if you want the original_text to include character (like an // opening quote) that doesn't appear in the value. 
-static void append_char_to_tag_buffer( - GumboParser* parser, int codepoint, bool reinitilize_position_on_first) { +static void append_char_to_tag_buffer ( + GumboParser* parser, + int codepoint, + bool reinitilize_position_on_first +) { GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer; if (buffer->length == 0 && reinitilize_position_on_first) { reset_tag_buffer_start_point(parser); } - gumbo_string_buffer_append_codepoint(parser, codepoint, buffer); + gumbo_string_buffer_append_codepoint(codepoint, buffer); } -// (Re-)initialize the tag buffer. This also resets the original_text pointer +// (Re-)initialize the tag buffer. This also resets the original_text pointer // and _start_pos field to point to the current position. static void initialize_tag_buffer(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; GumboTagState* tag_state = &tokenizer->_tag_state; - gumbo_string_buffer_init(parser, &tag_state->_buffer); + gumbo_string_buffer_init(&tag_state->_buffer); reset_tag_buffer_start_point(parser); } // Initializes the tag_state to start a new tag, keeping track of the opening -// positions and original text. Takes a boolean indicating whether this is a +// positions and original text. Takes a boolean indicating whether this is a // start or end tag. static void start_new_tag(GumboParser* parser, bool is_start_tag) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; @@ -690,14 +724,15 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) { assert(is_alpha(c)); initialize_tag_buffer(parser); - gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer); + gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer); + assert(tag_state->_name == NULL); assert(tag_state->_attributes.data == NULL); // Initial size chosen by statistical analysis of a corpus of 60k webpages. - // 99.5% of elements have 0 attributes, 93% of the remainder have 1. 
These + // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1 // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs. - gumbo_vector_init(parser, 1, &tag_state->_attributes); + gumbo_vector_init(1, &tag_state->_attributes); tag_state->_drop_next_attr_value = false; tag_state->_is_start_tag = is_start_tag; tag_state->_is_self_closing = false; @@ -708,7 +743,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) { static void copy_over_tag_buffer(GumboParser* parser, const char** output) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; GumboTagState* tag_state = &tokenizer->_tag_state; - *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer); + *output = gumbo_string_buffer_to_string(&tag_state->_buffer); } // Fills in: @@ -717,9 +752,12 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) { // * The start_pos GumboSourcePosition with the start position of the tag // buffer. // * The end_pos GumboSourcePosition with the current source position. -static void copy_over_original_tag_text(GumboParser* parser, - GumboStringPiece* original_text, GumboSourcePosition* start_pos, - GumboSourcePosition* end_pos) { +static void copy_over_original_tag_text ( + GumboParser* parser, + GumboStringPiece* original_text, + GumboSourcePosition* start_pos, + GumboSourcePosition* end_pos +) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; GumboTagState* tag_state = &tokenizer->_tag_state; @@ -729,7 +767,7 @@ static void copy_over_original_tag_text(GumboParser* parser, if (original_text->data[original_text->length - 1] == '\r') { // Since \r is skipped by the UTF-8 iterator, it can sometimes end up // appended to the end of original text even when it's really the first part - // of the next character. If we detect this situation, shrink the length of + // of the next character. 
If we detect this situation, shrink the length of // the original text by 1 to remove the carriage return. --original_text->length; } @@ -739,8 +777,7 @@ static void copy_over_original_tag_text(GumboParser* parser, // Releases and then re-initializes the tag buffer. static void reinitialize_tag_buffer(GumboParser* parser) { - gumbo_parser_deallocate( - parser, parser->_tokenizer_state->_tag_state._buffer.data); + gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data); initialize_tag_buffer(parser); } @@ -750,14 +787,24 @@ static void finish_tag_name(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; GumboTagState* tag_state = &tokenizer->_tag_state; - tag_state->_tag = - gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length); + const char *data = tag_state->_buffer.data; + size_t length = tag_state->_buffer.length; + tag_state->_tag = gumbo_tagn_enum(data, length); + if (tag_state->_tag == GUMBO_TAG_UNKNOWN) { + char *name = gumbo_alloc(length + 1); + memcpy(name, data, length); + name[length] = 0; + tag_state->_name = name; + } reinitialize_tag_buffer(parser); } // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct. -static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name, - int original_index, int new_index) { +static void add_duplicate_attr_error ( + GumboParser* parser, + int original_index, + int new_index +) { GumboError* error = gumbo_add_error(parser); if (!error) { return; @@ -773,11 +820,11 @@ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name, } // Creates a new attribute in the current tag, copying the current tag buffer to -// the attribute's name. The attribute's value starts out as the empty string +// the attribute's name. The attribute's value starts out as the empty string // (following the "Boolean attributes" section of the spec) and is only -// overwritten on finish_attribute_value(). 
If the attribute has already been +// overwritten on finish_attribute_value(). If the attribute has already been // specified, the new attribute is dropped, a parse error is added, and the -// function returns false. Otherwise, this returns true. +// function returns false. Otherwise, this returns true. static bool finish_attribute_name(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; GumboTagState* tag_state = &tokenizer->_tag_state; @@ -789,30 +836,43 @@ static bool finish_attribute_name(GumboParser* parser) { GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes; for (unsigned int i = 0; i < attributes->length; ++i) { GumboAttribute* attr = attributes->data[i]; - if (strlen(attr->name) == tag_state->_buffer.length && - memcmp(attr->name, tag_state->_buffer.data, - tag_state->_buffer.length) == 0) { + if ( + strlen(attr->name) == tag_state->_buffer.length + && 0 == memcmp ( + attr->name, + tag_state->_buffer.data, + tag_state->_buffer.length + ) + ) { // Identical attribute; bail. 
- add_duplicate_attr_error(parser, attr->name, i, attributes->length); + add_duplicate_attr_error(parser, i, attributes->length); tag_state->_drop_next_attr_value = true; return false; } } - GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute)); + GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute)); attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE; copy_over_tag_buffer(parser, &attr->name); - copy_over_original_tag_text( - parser, &attr->original_name, &attr->name_start, &attr->name_end); - attr->value = gumbo_copy_stringz(parser, ""); - copy_over_original_tag_text( - parser, &attr->original_value, &attr->name_start, &attr->name_end); - gumbo_vector_add(parser, attr, attributes); + copy_over_original_tag_text ( + parser, + &attr->original_name, + &attr->name_start, + &attr->name_end + ); + attr->value = gumbo_strdup(""); + copy_over_original_tag_text ( + parser, + &attr->original_value, + &attr->name_start, + &attr->name_end + ); + gumbo_vector_add(attr, attributes); reinitialize_tag_buffer(parser); return true; } -// Finishes an attribute value. This sets the value of the most recently added +// Finishes an attribute value. This sets the value of the most recently added // attribute to the current contents of the tag buffer. 
static void finish_attribute_value(GumboParser* parser) { GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state; @@ -826,7 +886,7 @@ static void finish_attribute_value(GumboParser* parser) { GumboAttribute* attr = tag_state->_attributes.data[tag_state->_attributes.length - 1]; - gumbo_parser_deallocate(parser, (void*) attr->value); + gumbo_free((void*) attr->value); copy_over_tag_buffer(parser, &attr->value); copy_over_original_tag_text( parser, &attr->original_value, &attr->value_start, &attr->value_end); @@ -842,24 +902,27 @@ static bool is_appropriate_end_tag(GumboParser* parser) { tag_state->_buffer.length); } -void gumbo_tokenizer_state_init( - GumboParser* parser, const char* text, size_t text_length) { - GumboTokenizerState* tokenizer = - gumbo_parser_allocate(parser, sizeof(GumboTokenizerState)); +void gumbo_tokenizer_state_init ( + GumboParser* parser, + const char* text, + size_t text_length +) { + GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState)); parser->_tokenizer_state = tokenizer; gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); tokenizer->_reconsume_current_input = false; tokenizer->_is_current_node_foreign = false; tokenizer->_is_in_cdata = false; tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST; + tokenizer->_tag_state._name = NULL; tokenizer->_buffered_emit_char = kGumboNoChar; - gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer); + gumbo_string_buffer_init(&tokenizer->_temporary_buffer); tokenizer->_temporary_buffer_emit = NULL; mark_tag_state_as_empty(&tokenizer->_tag_state); - gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_init(&tokenizer->_script_data_buffer); tokenizer->_token_start = text; utf8iterator_init(parser, text, text_length, &tokenizer->_input); utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos); @@ -871,27 +934,37 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) { 
assert(tokenizer->_doc_type_state.name == NULL); assert(tokenizer->_doc_type_state.public_identifier == NULL); assert(tokenizer->_doc_type_state.system_identifier == NULL); - gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer); - gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer); - gumbo_parser_deallocate(parser, tokenizer); + gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer); + gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer); + assert(tokenizer->_tag_state._name == NULL); + assert(tokenizer->_tag_state._attributes.data == NULL); + gumbo_free(tokenizer); } void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) { parser->_tokenizer_state->_state = state; } -void gumbo_tokenizer_set_is_current_node_foreign( - GumboParser* parser, bool is_foreign) { +void gumbo_tokenizer_set_is_current_node_foreign ( + GumboParser* parser, + bool is_foreign +) { if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) { - gumbo_debug("Toggling is_current_node_foreign to %s.\n", - is_foreign ? "true" : "false"); + gumbo_debug ( + "Toggling is_current_node_foreign to %s.\n", + is_foreign ? 
"true" : "false" + ); } parser->_tokenizer_state->_is_current_node_foreign = is_foreign; } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state -static StateResult handle_data_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#data-state +static StateResult handle_data_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '&': gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA); @@ -914,16 +987,24 @@ static StateResult handle_data_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state -static StateResult handle_char_ref_in_data_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state +static StateResult handle_char_ref_in_data_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int UNUSED_ARG(c), + GumboToken* output +) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); return emit_char_ref(parser, ' ', false, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state -static StateResult handle_rcdata_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state +static StateResult handle_rcdata_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '&': gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA); @@ -943,16 +1024,24 @@ static StateResult handle_rcdata_state(GumboParser* parser, } } -// 
http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state -static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state +static StateResult handle_char_ref_in_rcdata_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int UNUSED_ARG(c), + GumboToken* output +) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA); return emit_char_ref(parser, ' ', false, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state -static StateResult handle_rawtext_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state +static StateResult handle_rawtext_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '<': gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT); @@ -968,9 +1057,13 @@ static StateResult handle_rawtext_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state -static StateResult handle_script_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-state +static StateResult handle_script_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '<': gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT); @@ -986,9 +1079,13 @@ static StateResult handle_script_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state -static StateResult handle_plaintext_state(GumboParser* parser, - 
GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state +static StateResult handle_plaintext_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '\0': return emit_replacement_char(parser, output); @@ -999,9 +1096,13 @@ static StateResult handle_plaintext_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state -static StateResult handle_tag_open_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state +static StateResult handle_tag_open_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "<")); switch (c) { case '!': @@ -1032,9 +1133,13 @@ static StateResult handle_tag_open_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state -static StateResult handle_end_tag_open_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state +static StateResult handle_end_tag_open_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "</")); switch (c) { case '>': @@ -1059,9 +1164,13 @@ static StateResult handle_end_tag_open_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state -static StateResult handle_tag_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state +static StateResult handle_tag_name_state ( 
+ GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1093,9 +1202,13 @@ static StateResult handle_tag_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state -static StateResult handle_rcdata_lt_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state +static StateResult handle_rcdata_lt_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "<")); if (c == '/') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN); @@ -1108,9 +1221,13 @@ static StateResult handle_rcdata_lt_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state -static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state +static StateResult handle_rcdata_end_tag_open_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "</")); if (is_alpha(c)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME); @@ -1124,9 +1241,14 @@ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser, return true; } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state -static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state +static StateResult handle_rcdata_end_tag_name_state ( + GumboParser* parser, + 
GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { + UNUSED_IF_NDEBUG(tokenizer); assert(tokenizer->_temporary_buffer.length >= 2); if (is_alpha(c)) { append_char_to_tag_buffer(parser, ensure_lowercase(c), true); @@ -1156,9 +1278,13 @@ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser, return emit_temporary_buffer(parser, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state -static StateResult handle_rawtext_lt_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state +static StateResult handle_rawtext_lt_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "<")); if (c == '/') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN); @@ -1171,9 +1297,13 @@ static StateResult handle_rawtext_lt_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state -static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state +static StateResult handle_rawtext_end_tag_open_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "</")); if (is_alpha(c)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME); @@ -1186,9 +1316,13 @@ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state -static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, 
int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state +static StateResult handle_rawtext_end_tag_name_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { assert(tokenizer->_temporary_buffer.length >= 2); gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length, tokenizer->_tag_state._buffer.data); @@ -1221,9 +1355,13 @@ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser, return emit_temporary_buffer(parser, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state -static StateResult handle_script_lt_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state +static StateResult handle_script_lt_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "<")); if (c == '/') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN); @@ -1240,9 +1378,13 @@ static StateResult handle_script_lt_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state -static StateResult handle_script_end_tag_open_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state +static StateResult handle_script_end_tag_open_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "</")); if (is_alpha(c)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME); @@ -1255,9 +1397,14 @@ static StateResult handle_script_end_tag_open_state(GumboParser* parser, } } -// 
http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state -static StateResult handle_script_end_tag_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state +static StateResult handle_script_end_tag_name_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { + UNUSED_IF_NDEBUG(tokenizer); assert(tokenizer->_temporary_buffer.length >= 2); if (is_alpha(c)) { append_char_to_tag_buffer(parser, ensure_lowercase(c), true); @@ -1287,9 +1434,13 @@ static StateResult handle_script_end_tag_name_state(GumboParser* parser, return emit_temporary_buffer(parser, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state -static StateResult handle_script_escaped_start_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state +static StateResult handle_script_escaped_start_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { if (c == '-') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH); return emit_current_char(parser, output); @@ -1300,9 +1451,13 @@ static StateResult handle_script_escaped_start_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state -static StateResult handle_script_escaped_start_dash_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state +static StateResult handle_script_escaped_start_dash_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* 
output +) { if (c == '-') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH); return emit_current_char(parser, output); @@ -1313,9 +1468,13 @@ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state -static StateResult handle_script_escaped_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state +static StateResult handle_script_escaped_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH); @@ -1335,9 +1494,13 @@ static StateResult handle_script_escaped_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state -static StateResult handle_script_escaped_dash_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state +static StateResult handle_script_escaped_dash_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH); @@ -1360,9 +1523,13 @@ static StateResult handle_script_escaped_dash_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state -static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state +static StateResult 
handle_script_escaped_dash_dash_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': return emit_current_char(parser, output); @@ -1387,9 +1554,13 @@ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state -static StateResult handle_script_escaped_lt_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state +static StateResult handle_script_escaped_lt_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "<")); assert(!tokenizer->_script_data_buffer.length); if (c == '/') { @@ -1399,8 +1570,10 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser, } else if (is_alpha(c)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START); append_char_to_temporary_buffer(parser, c); - gumbo_string_buffer_append_codepoint( - parser, ensure_lowercase(c), &tokenizer->_script_data_buffer); + gumbo_string_buffer_append_codepoint ( + ensure_lowercase(c), + &tokenizer->_script_data_buffer + ); return emit_temporary_buffer(parser, output); } else { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); @@ -1408,9 +1581,13 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state -static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state +static StateResult 
handle_script_escaped_end_tag_open_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { assert(temporary_buffer_equals(parser, "</")); if (is_alpha(c)) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME); @@ -1423,9 +1600,14 @@ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state -static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state +static StateResult handle_script_escaped_end_tag_name_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { + UNUSED_IF_NDEBUG(tokenizer); assert(tokenizer->_temporary_buffer.length >= 2); if (is_alpha(c)) { append_char_to_tag_buffer(parser, ensure_lowercase(c), true); @@ -1455,9 +1637,13 @@ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser, return emit_temporary_buffer(parser, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state -static StateResult handle_script_double_escaped_start_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state +static StateResult handle_script_double_escaped_start_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1465,16 +1651,22 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser, case ' ': case '/': case '>': - gumbo_tokenizer_set_state( - parser, gumbo_string_equals(&kScriptTag, - 
(GumboStringPiece*) &tokenizer->_script_data_buffer) - ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED - : GUMBO_LEX_SCRIPT_ESCAPED); + gumbo_tokenizer_set_state ( + parser, + gumbo_string_equals ( + &kScriptTag, + (GumboStringPiece*) &tokenizer->_script_data_buffer + ) + ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED + : GUMBO_LEX_SCRIPT_ESCAPED + ); return emit_current_char(parser, output); default: if (is_alpha(c)) { - gumbo_string_buffer_append_codepoint( - parser, ensure_lowercase(c), &tokenizer->_script_data_buffer); + gumbo_string_buffer_append_codepoint ( + ensure_lowercase(c), + &tokenizer->_script_data_buffer + ); return emit_current_char(parser, output); } else { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED); @@ -1484,9 +1676,13 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state -static StateResult handle_script_double_escaped_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state +static StateResult handle_script_double_escaped_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH); @@ -1505,9 +1701,13 @@ static StateResult handle_script_double_escaped_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state -static StateResult handle_script_double_escaped_dash_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state +static StateResult handle_script_double_escaped_dash_state ( + GumboParser* parser, + GumboTokenizerState* 
UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state( @@ -1529,10 +1729,13 @@ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state -static StateResult handle_script_double_escaped_dash_dash_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state +static StateResult handle_script_double_escaped_dash_dash_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': return emit_current_char(parser, output); @@ -1555,12 +1758,16 @@ static StateResult handle_script_double_escaped_dash_dash_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state -static StateResult handle_script_double_escaped_lt_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state +static StateResult handle_script_double_escaped_lt_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { if (c == '/') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END); - gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_clear(&tokenizer->_script_data_buffer); return emit_current_char(parser, output); } else { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); @@ -1569,9 +1776,13 @@ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser, } } -// 
http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state -static StateResult handle_script_double_escaped_end_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state +static StateResult handle_script_double_escaped_end_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1587,8 +1798,10 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser, return emit_current_char(parser, output); default: if (is_alpha(c)) { - gumbo_string_buffer_append_codepoint( - parser, ensure_lowercase(c), &tokenizer->_script_data_buffer); + gumbo_string_buffer_append_codepoint ( + ensure_lowercase(c), + &tokenizer->_script_data_buffer + ); return emit_current_char(parser, output); } else { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); @@ -1598,9 +1811,13 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state -static StateResult handle_before_attr_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state +static StateResult handle_before_attr_name_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1636,9 +1853,13 @@ static StateResult handle_before_attr_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state -static StateResult handle_attr_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// 
https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state +static StateResult handle_attr_name_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1679,9 +1900,13 @@ static StateResult handle_attr_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state -static StateResult handle_after_attr_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state +static StateResult handle_after_attr_name_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1719,9 +1944,13 @@ static StateResult handle_after_attr_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state -static StateResult handle_before_attr_value_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state +static StateResult handle_before_attr_value_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1768,9 +1997,13 @@ static StateResult handle_before_attr_value_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state -static StateResult handle_attr_value_double_quoted_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state +static StateResult handle_attr_value_double_quoted_state ( + GumboParser* 
parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* UNUSED_ARG(output) +) { switch (c) { case '"': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED); @@ -1796,9 +2029,13 @@ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state -static StateResult handle_attr_value_single_quoted_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state +static StateResult handle_attr_value_single_quoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* UNUSED_ARG(output) +) { switch (c) { case '\'': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED); @@ -1824,9 +2061,13 @@ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state -static StateResult handle_attr_value_unquoted_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state +static StateResult handle_attr_value_unquoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -1867,9 +2108,13 @@ static StateResult handle_attr_value_unquoted_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state -static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state +static 
StateResult handle_char_ref_in_attr_value_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int UNUSED_ARG(c), + GumboToken* UNUSED_ARG(output) +) { OneOrTwoCodepoints char_ref; int allowed_char; bool is_unquoted = false; @@ -1893,9 +2138,15 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser, // Ignore the status, since we don't have a convenient way of signalling that // a parser error has occurred when the error occurs in the middle of a - // multi-state token. We'd need a flag inside the TokenizerState to do this, + // multi-state token. We'd need a flag inside the TokenizerState to do this, // but that's a low priority fix. - consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref); + gumbo_consume_char_ref ( + parser, + &tokenizer->_input, + allowed_char, + true, + &char_ref + ); if (char_ref.first != kGumboNoChar) { tokenizer->_reconsume_current_input = true; append_char_to_tag_buffer(parser, char_ref.first, is_unquoted); @@ -1909,9 +2160,13 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser, return NEXT_CHAR; } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state -static StateResult handle_after_attr_value_quoted_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state +static StateResult handle_after_attr_value_quoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { finish_attribute_value(parser); switch (c) { case '\t': @@ -1940,9 +2195,13 @@ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state -static StateResult handle_self_closing_start_tag_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, 
GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state +static StateResult handle_self_closing_start_tag_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '>': gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); @@ -1961,9 +2220,13 @@ static StateResult handle_self_closing_start_tag_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state -static StateResult handle_bogus_comment_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state +static StateResult handle_bogus_comment_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { while (c != '>' && c != -1) { if (c == '\0') { tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL); @@ -1977,29 +2240,48 @@ static StateResult handle_bogus_comment_state(GumboParser* parser, return emit_comment(parser, output); } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state -static StateResult handle_markup_declaration_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { - if (utf8iterator_maybe_consume_match( - &tokenizer->_input, "--", sizeof("--") - 1, true)) { +// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state +static StateResult handle_markup_declaration_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int UNUSED_ARG(c), + GumboToken* UNUSED_ARG(output) +) { + if ( + utf8iterator_maybe_consume_match ( + &tokenizer->_input, + "--", + sizeof("--") - 1, + true + ) + ) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START); tokenizer->_reconsume_current_input = true; - } else if (utf8iterator_maybe_consume_match( - &tokenizer->_input, "DOCTYPE", 
sizeof("DOCTYPE") - 1, false)) { + } else if ( + utf8iterator_maybe_consume_match ( + &tokenizer->_input, + "DOCTYPE", + sizeof("DOCTYPE") - 1, + false + ) + ) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE); tokenizer->_reconsume_current_input = true; // If we get here, we know we'll eventually emit a doctype token, so now is - // the time to initialize the doctype strings. (Not in doctype_state_init, + // the time to initialize the doctype strings. (Not in doctype_state_init, // since then they'll leak if ownership never gets transferred to the // doctype token. - tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, ""); - tokenizer->_doc_type_state.public_identifier = - gumbo_copy_stringz(parser, ""); - tokenizer->_doc_type_state.system_identifier = - gumbo_copy_stringz(parser, ""); - } else if (tokenizer->_is_current_node_foreign && - utf8iterator_maybe_consume_match( - &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) { + tokenizer->_doc_type_state.name = gumbo_strdup(""); + tokenizer->_doc_type_state.public_identifier = gumbo_strdup(""); + tokenizer->_doc_type_state.system_identifier = gumbo_strdup(""); + } else if ( + tokenizer->_is_current_node_foreign + && utf8iterator_maybe_consume_match ( + &tokenizer->_input, + "[CDATA[", sizeof("[CDATA[") - 1, + true + ) + ) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA); tokenizer->_is_in_cdata = true; tokenizer->_reconsume_current_input = true; @@ -2012,9 +2294,13 @@ static StateResult handle_markup_declaration_state(GumboParser* parser, return NEXT_CHAR; } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state -static StateResult handle_comment_start_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state +static StateResult handle_comment_start_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* 
output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH); @@ -2041,9 +2327,13 @@ static StateResult handle_comment_start_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state -static StateResult handle_comment_start_dash_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state +static StateResult handle_comment_start_dash_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END); @@ -2072,9 +2362,13 @@ static StateResult handle_comment_start_dash_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state -static StateResult handle_comment_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#comment-state +static StateResult handle_comment_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH); @@ -2094,9 +2388,13 @@ static StateResult handle_comment_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state -static StateResult handle_comment_end_dash_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state +static StateResult handle_comment_end_dash_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END); @@ -2120,9 
+2418,13 @@ static StateResult handle_comment_end_dash_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state -static StateResult handle_comment_end_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state +static StateResult handle_comment_end_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '>': gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); @@ -2159,9 +2461,13 @@ static StateResult handle_comment_end_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state -static StateResult handle_comment_end_bang_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state +static StateResult handle_comment_end_bang_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { switch (c) { case '-': gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH); @@ -2195,9 +2501,13 @@ static StateResult handle_comment_end_bang_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state -static StateResult handle_doctype_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#doctype-state +static StateResult handle_doctype_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { assert(!tokenizer->_temporary_buffer.length); switch (c) { case '\t': @@ -2221,9 +2531,13 @@ static StateResult handle_doctype_state(GumboParser* parser, } } -// 
http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state -static StateResult handle_before_doctype_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state +static StateResult handle_before_doctype_name_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2256,21 +2570,25 @@ static StateResult handle_before_doctype_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state -static StateResult handle_doctype_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state +static StateResult handle_doctype_name_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': case '\f': case ' ': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME); - gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name); + gumbo_free((void*) tokenizer->_doc_type_state.name); finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name); return NEXT_CHAR; case '>': gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); - gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name); + gumbo_free((void*) tokenizer->_doc_type_state.name); finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name); emit_doctype(parser, output); return RETURN_SUCCESS; @@ -2282,7 +2600,7 @@ static StateResult handle_doctype_name_state(GumboParser* parser, tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF); gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); tokenizer->_doc_type_state.force_quirks = true; - gumbo_parser_deallocate(parser, (void*) 
tokenizer->_doc_type_state.name); + gumbo_free((void*) tokenizer->_doc_type_state.name); finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name); emit_doctype(parser, output); return RETURN_ERROR; @@ -2294,9 +2612,13 @@ static StateResult handle_doctype_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state -static StateResult handle_after_doctype_name_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state +static StateResult handle_after_doctype_name_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2334,10 +2656,13 @@ static StateResult handle_after_doctype_name_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state -static StateResult handle_after_doctype_public_keyword_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state +static StateResult handle_after_doctype_public_keyword_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2347,13 +2672,13 @@ static StateResult handle_after_doctype_public_keyword_state( return NEXT_CHAR; case '"': tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED); return NEXT_CHAR; case '\'': tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, 
GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED); return NEXT_CHAR; @@ -2378,9 +2703,13 @@ static StateResult handle_after_doctype_public_keyword_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state -static StateResult handle_before_doctype_public_id_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state +static StateResult handle_before_doctype_public_id_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2388,12 +2717,12 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser, case ' ': return NEXT_CHAR; case '"': - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED); return NEXT_CHAR; case '\'': - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED); return NEXT_CHAR; @@ -2418,10 +2747,13 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state -static StateResult handle_doctype_public_id_double_quoted_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state +static StateResult handle_doctype_public_id_double_quoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '"': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID); @@ -2451,10 +2783,13 @@ static StateResult 
handle_doctype_public_id_double_quoted_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state -static StateResult handle_doctype_public_id_single_quoted_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state +static StateResult handle_doctype_public_id_single_quoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\'': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID); @@ -2484,9 +2819,13 @@ static StateResult handle_doctype_public_id_single_quoted_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state -static StateResult handle_after_doctype_public_id_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state +static StateResult handle_after_doctype_public_id_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2501,13 +2840,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser, return RETURN_SUCCESS; case '"': tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); return NEXT_CHAR; case '\'': tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); return NEXT_CHAR; @@ -2526,10 +2865,13 @@ static StateResult 
handle_after_doctype_public_id_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state -static StateResult handle_between_doctype_public_system_id_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state +static StateResult handle_between_doctype_public_system_id_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2541,12 +2883,12 @@ static StateResult handle_between_doctype_public_system_id_state( emit_doctype(parser, output); return RETURN_SUCCESS; case '"': - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); return NEXT_CHAR; case '\'': - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); return NEXT_CHAR; @@ -2565,10 +2907,13 @@ static StateResult handle_between_doctype_public_system_id_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state -static StateResult handle_after_doctype_system_keyword_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state +static StateResult handle_after_doctype_system_keyword_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2578,13 +2923,13 @@ static StateResult handle_after_doctype_system_keyword_state( return NEXT_CHAR; case '"': tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); - 
assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); return NEXT_CHAR; case '\'': tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID); - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); return NEXT_CHAR; @@ -2608,9 +2953,13 @@ static StateResult handle_after_doctype_system_keyword_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state -static StateResult handle_before_doctype_system_id_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state +static StateResult handle_before_doctype_system_id_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2618,12 +2967,12 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser, case ' ': return NEXT_CHAR; case '"': - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED); return NEXT_CHAR; case '\'': - assert(temporary_buffer_equals(parser, "")); + assert(temporary_buffer_is_empty(parser)); gumbo_tokenizer_set_state( parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED); return NEXT_CHAR; @@ -2647,10 +2996,13 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state -static StateResult handle_doctype_system_id_double_quoted_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// 
https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state +static StateResult handle_doctype_system_id_double_quoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '"': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID); @@ -2680,10 +3032,13 @@ static StateResult handle_doctype_system_id_double_quoted_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state -static StateResult handle_doctype_system_id_single_quoted_state( - GumboParser* parser, GumboTokenizerState* tokenizer, int c, - GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state +static StateResult handle_doctype_system_id_single_quoted_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\'': gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID); @@ -2713,9 +3068,13 @@ static StateResult handle_doctype_system_id_single_quoted_state( } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state -static StateResult handle_after_doctype_system_id_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state +static StateResult handle_after_doctype_system_id_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { switch (c) { case '\t': case '\n': @@ -2739,9 +3098,13 @@ static StateResult handle_after_doctype_system_id_state(GumboParser* parser, } } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state -static StateResult handle_bogus_doctype_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, 
GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state +static StateResult handle_bogus_doctype_state ( + GumboParser* parser, + GumboTokenizerState* UNUSED_ARG(tokenizer), + int c, + GumboToken* output +) { if (c == '>' || c == -1) { gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA); emit_doctype(parser, output); @@ -2750,9 +3113,13 @@ static StateResult handle_bogus_doctype_state(GumboParser* parser, return NEXT_CHAR; } -// http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state -static StateResult handle_cdata_state(GumboParser* parser, - GumboTokenizerState* tokenizer, int c, GumboToken* output) { +// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state +static StateResult handle_cdata_state ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +) { if (c == -1 || utf8iterator_maybe_consume_match( &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) { tokenizer->_reconsume_current_input = true; @@ -2765,50 +3132,83 @@ static StateResult handle_cdata_state(GumboParser* parser, } } -typedef StateResult (*GumboLexerStateFunction)( - GumboParser*, GumboTokenizerState*, int, GumboToken*); - -static GumboLexerStateFunction dispatch_table[] = {handle_data_state, - handle_char_ref_in_data_state, handle_rcdata_state, - handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state, - handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state, - handle_tag_name_state, handle_rcdata_lt_state, - handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state, - handle_rawtext_lt_state, handle_rawtext_end_tag_open_state, - handle_rawtext_end_tag_name_state, handle_script_lt_state, - handle_script_end_tag_open_state, handle_script_end_tag_name_state, - handle_script_escaped_start_state, handle_script_escaped_start_dash_state, - handle_script_escaped_state, handle_script_escaped_dash_state, - 
handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state, - handle_script_escaped_end_tag_open_state, - handle_script_escaped_end_tag_name_state, - handle_script_double_escaped_start_state, - handle_script_double_escaped_state, handle_script_double_escaped_dash_state, - handle_script_double_escaped_dash_dash_state, - handle_script_double_escaped_lt_state, - handle_script_double_escaped_end_state, handle_before_attr_name_state, - handle_attr_name_state, handle_after_attr_name_state, - handle_before_attr_value_state, handle_attr_value_double_quoted_state, - handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state, - handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state, - handle_self_closing_start_tag_state, handle_bogus_comment_state, - handle_markup_declaration_state, handle_comment_start_state, - handle_comment_start_dash_state, handle_comment_state, - handle_comment_end_dash_state, handle_comment_end_state, - handle_comment_end_bang_state, handle_doctype_state, - handle_before_doctype_name_state, handle_doctype_name_state, - handle_after_doctype_name_state, handle_after_doctype_public_keyword_state, - handle_before_doctype_public_id_state, - handle_doctype_public_id_double_quoted_state, - handle_doctype_public_id_single_quoted_state, - handle_after_doctype_public_id_state, - handle_between_doctype_public_system_id_state, - handle_after_doctype_system_keyword_state, - handle_before_doctype_system_id_state, - handle_doctype_system_id_double_quoted_state, - handle_doctype_system_id_single_quoted_state, - handle_after_doctype_system_id_state, handle_bogus_doctype_state, - handle_cdata_state}; +typedef StateResult (*GumboLexerStateFunction) ( + GumboParser* parser, + GumboTokenizerState* tokenizer, + int c, + GumboToken* output +); + +static GumboLexerStateFunction dispatch_table[] = { + handle_data_state, + handle_char_ref_in_data_state, + handle_rcdata_state, + handle_char_ref_in_rcdata_state, + handle_rawtext_state, + 
handle_script_state, + handle_plaintext_state, + handle_tag_open_state, + handle_end_tag_open_state, + handle_tag_name_state, + handle_rcdata_lt_state, + handle_rcdata_end_tag_open_state, + handle_rcdata_end_tag_name_state, + handle_rawtext_lt_state, + handle_rawtext_end_tag_open_state, + handle_rawtext_end_tag_name_state, + handle_script_lt_state, + handle_script_end_tag_open_state, + handle_script_end_tag_name_state, + handle_script_escaped_start_state, + handle_script_escaped_start_dash_state, + handle_script_escaped_state, + handle_script_escaped_dash_state, + handle_script_escaped_dash_dash_state, + handle_script_escaped_lt_state, + handle_script_escaped_end_tag_open_state, + handle_script_escaped_end_tag_name_state, + handle_script_double_escaped_start_state, + handle_script_double_escaped_state, + handle_script_double_escaped_dash_state, + handle_script_double_escaped_dash_dash_state, + handle_script_double_escaped_lt_state, + handle_script_double_escaped_end_state, + handle_before_attr_name_state, + handle_attr_name_state, + handle_after_attr_name_state, + handle_before_attr_value_state, + handle_attr_value_double_quoted_state, + handle_attr_value_single_quoted_state, + handle_attr_value_unquoted_state, + handle_char_ref_in_attr_value_state, + handle_after_attr_value_quoted_state, + handle_self_closing_start_tag_state, + handle_bogus_comment_state, + handle_markup_declaration_state, + handle_comment_start_state, + handle_comment_start_dash_state, + handle_comment_state, + handle_comment_end_dash_state, + handle_comment_end_state, + handle_comment_end_bang_state, + handle_doctype_state, + handle_before_doctype_name_state, + handle_doctype_name_state, + handle_after_doctype_name_state, + handle_after_doctype_public_keyword_state, + handle_before_doctype_public_id_state, + handle_doctype_public_id_double_quoted_state, + handle_doctype_public_id_single_quoted_state, + handle_after_doctype_public_id_state, + handle_between_doctype_public_system_id_state, + 
handle_after_doctype_system_keyword_state, + handle_before_doctype_system_id_state, + handle_doctype_system_id_double_quoted_state, + handle_doctype_system_id_single_quoted_state, + handle_after_doctype_system_id_state, + handle_bogus_doctype_state, + handle_cdata_state +}; bool gumbo_lex(GumboParser* parser, GumboToken* output) { // Because of the spec requirements that... @@ -2820,9 +3220,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) { // state. // // ...all state must be held in the GumboTokenizer struct instead of in local - // variables in this function. That allows us to return from this method with + // variables in this function. That allows us to return from this method with // a token, and then immediately jump back to the same state with the same - // input if we need to return a different token. The various emit_* functions + // input if we need to return a different token. The various emit_* functions // are responsible for changing state (eg. flushing the chardata buffer, // reading the next input character) to avoid an infinite loop. GumboTokenizerState* tokenizer = parser->_tokenizer_state; @@ -2846,10 +3246,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) { assert(!tokenizer->_temporary_buffer_emit); assert(tokenizer->_buffered_emit_char == kGumboNoChar); int c = utf8iterator_current(&tokenizer->_input); - gumbo_debug( - "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state); - StateResult result = - dispatch_table[tokenizer->_state](parser, tokenizer, c, output); + GumboTokenizerEnum state = tokenizer->_state; + gumbo_debug("Lexing character '%c' (%d) in state %u.\n", c, c, state); + StateResult result = dispatch_table[state](parser, tokenizer, c, output); // We need to clear reconsume_current_input before returning to prevent // certain infinite loop states. 
bool should_advance = !tokenizer->_reconsume_current_input; @@ -2867,30 +3266,29 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) { } } -void gumbo_token_destroy(GumboParser* parser, GumboToken* token) { +void gumbo_token_destroy(GumboToken* token) { if (!token) return; switch (token->type) { case GUMBO_TOKEN_DOCTYPE: - gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name); - gumbo_parser_deallocate( - parser, (void*) token->v.doc_type.public_identifier); - gumbo_parser_deallocate( - parser, (void*) token->v.doc_type.system_identifier); + gumbo_free((void*) token->v.doc_type.name); + gumbo_free((void*) token->v.doc_type.public_identifier); + gumbo_free((void*) token->v.doc_type.system_identifier); return; case GUMBO_TOKEN_START_TAG: for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) { GumboAttribute* attr = token->v.start_tag.attributes.data[i]; if (attr) { // May have been nulled out if this token was merged with another. - gumbo_destroy_attribute(parser, attr); + gumbo_destroy_attribute(attr); } } - gumbo_parser_deallocate( - parser, (void*) token->v.start_tag.attributes.data); + gumbo_free((void*) token->v.start_tag.attributes.data); + if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) + gumbo_free((void*) token->v.start_tag.name); return; case GUMBO_TOKEN_COMMENT: - gumbo_parser_deallocate(parser, (void*) token->v.text); + gumbo_free((void*) token->v.text); return; default: return; diff --git a/gumbo-parser/src/tokenizer.h b/gumbo-parser/src/tokenizer.h index 1e2a2ca7..b1f43a92 100644 --- a/gumbo-parser/src/tokenizer.h +++ b/gumbo-parser/src/tokenizer.h @@ -1,25 +1,9 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// This contains an implementation of a tokenizer for HTML5. It consumes a -// buffer of UTF-8 characters, and then emits a stream of tokens. - #ifndef GUMBO_TOKENIZER_H_ #define GUMBO_TOKENIZER_H_ +// This contains an implementation of a tokenizer for HTML5. It consumes a +// buffer of UTF-8 characters, and then emits a stream of tokens. + #include <stdbool.h> #include <stddef.h> @@ -49,11 +33,18 @@ typedef struct GumboInternalTokenDocType { // Struct containing all information pertaining to start tag tokens. typedef struct GumboInternalTokenStartTag { GumboTag tag; + const char *name; GumboVector /* GumboAttribute */ attributes; bool is_self_closing; } GumboTokenStartTag; -// A data structure representing a single token in the input stream. This +// Struct containing all information pertaining to end tag tokens. +typedef struct GumboInternalTokenEndTag { + GumboTag tag; + bool is_self_closing; +} GumboTokenEndTag; + +// A data structure representing a single token in the input stream. This // contains an enum for the type, the source position, a GumboStringPiece // pointing to the original text, and then a union for any parsed data. typedef struct GumboInternalToken { @@ -63,7 +54,7 @@ typedef struct GumboInternalToken { union { GumboTokenDocType doc_type; GumboTokenStartTag start_tag; - GumboTag end_tag; + GumboTokenEndTag end_tag; const char* text; // For comments. int character; // For character, whitespace, null, and EOF tokens. 
} v; @@ -71,28 +62,35 @@ typedef struct GumboInternalToken { // Initializes the tokenizer state within the GumboParser object, setting up a // parse of the specified text. -void gumbo_tokenizer_state_init( - struct GumboInternalParser* parser, const char* text, size_t text_length); +void gumbo_tokenizer_state_init ( + struct GumboInternalParser* parser, + const char* text, + size_t text_length +); // Destroys the tokenizer state within the GumboParser object, freeing any // dynamically-allocated structures within it. void gumbo_tokenizer_state_destroy(struct GumboInternalParser* parser); -// Sets the tokenizer state to the specified value. This is needed by some +// Sets the tokenizer state to the specified value. This is needed by some // parser states, which alter the state of the tokenizer in response to tags // seen. -void gumbo_tokenizer_set_state( - struct GumboInternalParser* parser, GumboTokenizerEnum state); +void gumbo_tokenizer_set_state ( + struct GumboInternalParser* parser, + GumboTokenizerEnum state +); -// Flags whether the current node is a foreign content element. This is +// Flags whether the current node is a foreign content element. This is // necessary for the markup declaration open state, where the tokenizer must be // aware of the state of the parser to properly tokenize bad comment tags. -// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#markup-declaration-open-state -void gumbo_tokenizer_set_is_current_node_foreign( - struct GumboInternalParser* parser, bool is_foreign); +// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state +void gumbo_tokenizer_set_is_current_node_foreign ( + struct GumboInternalParser* parser, + bool is_foreign +); // Lexes a single token from the specified buffer, filling the output with the -// parsed GumboToken data structure. Returns true for a successful +// parsed GumboToken data structure. 
Returns true for a successful // tokenization, false if a parse error occurs. // // Example: @@ -101,23 +99,22 @@ void gumbo_tokenizer_set_is_current_node_foreign( // gumbo_tokenizer_state_init(&parser, text, strlen(text)); // while (gumbo_lex(&parser, &output)) { // ...do stuff with output. -// gumbo_token_destroy(&parser, &token); +// gumbo_token_destroy(&token); // } // gumbo_tokenizer_state_destroy(&parser); bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output); -// Frees the internally-allocated pointers within an GumboToken. Note that this +// Frees the internally-allocated pointers within a GumboToken. Note that this // doesn't free the token itself, since oftentimes it will be allocated on the -// stack. A simple call to free() (or GumboParser->deallocator, if -// appropriate) can handle that. +// stack. // // Note that if you are handing over ownership of the internal strings to some // other data structure - for example, a parse tree - these do not need to be // freed. -void gumbo_token_destroy(struct GumboInternalParser* parser, GumboToken* token); +void gumbo_token_destroy(GumboToken* token); #ifdef __cplusplus } #endif -#endif // GUMBO_TOKENIZER_H_ +#endif // GUMBO_TOKENIZER_H_ diff --git a/gumbo-parser/src/tokenizer_states.h b/gumbo-parser/src/tokenizer_states.h index 80659f5f..cb3f4b24 100644 --- a/gumbo-parser/src/tokenizer_states.h +++ b/gumbo-parser/src/tokenizer_states.h @@ -1,32 +1,16 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// This contains the list of states used in the tokenizer. Although at first +#ifndef GUMBO_TOKENIZER_STATES_H_ +#define GUMBO_TOKENIZER_STATES_H_ + +// This contains the list of states used in the tokenizer. Although at first // glance it seems like these could be kept internal to the tokenizer, several // of the actions in the parser require that it reach into the tokenizer and -// reset the tokenizer state. For that to work, it needs to have the +// reset the tokenizer state. For that to work, it needs to have the // definitions of individual states available. // // This may also be useful for providing more detailed error messages for parse // errors, as we can match up states and inputs in a table without having to // clutter the tokenizer code with lots of precise error messages. -#ifndef GUMBO_TOKENIZER_STATES_H_ -#define GUMBO_TOKENIZER_STATES_H_ - // The ordering of this enum is also used to build the dispatch table for the // tokenizer state machine, so if it is changed, be sure to update that too. typedef enum { @@ -100,4 +84,4 @@ typedef enum { GUMBO_LEX_CDATA } GumboTokenizerEnum; -#endif // GUMBO_TOKENIZER_STATES_H_ +#endif // GUMBO_TOKENIZER_STATES_H_ diff --git a/gumbo-parser/src/utf8.c b/gumbo-parser/src/utf8.c index fdd6f837..e1c34f3f 100644 --- a/gumbo-parser/src/utf8.c +++ b/gumbo-parser/src/utf8.c @@ -1,59 +1,53 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2018 Craig Barnes. + Copyright 2010 Google Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include "utf8.h" #include <assert.h> #include <stdint.h> #include <string.h> -#include <strings.h> // For strncasecmp. #include "error.h" #include "gumbo.h" #include "parser.h" -#include "util.h" +#include "ascii.h" #include "vector.h" const int kUtf8ReplacementChar = 0xFFFD; -// Reference material: -// Wikipedia: http://en.wikipedia.org/wiki/UTF-8#Description -// RFC 3629: http://tools.ietf.org/html/rfc3629 -// HTML5 Unicode handling: -// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html#preprocessing-the-input-stream -// -// This implementation is based on a DFA-based decoder by Bjoern Hoehrmann -// <bjoern@hoehrmann.de>. We wrap the inner table-based decoder routine in our -// own handling for newlines, tabs, invalid continuation bytes, and other -// conditions that the HTML5 spec fully specifies but normal UTF8 decoders do -// not handle. 
-// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. Full text of -// the license agreement and code follows. +// References: +// * https://tools.ietf.org/html/rfc3629 +// * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream -// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to -// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -// of the Software, and to permit persons to whom the Software is furnished to -// do -// so, subject to the following conditions: +// The following code is a DFA-based UTF-8 decoder by Bjoern Hoehrmann. +// We wrap the inner table-based decoder routine in our own handling for +// newlines, tabs, invalid continuation bytes, and other conditions that +// the HTML5 spec fully specifies but normal UTF-8 decoders do not handle. +// See https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
@@ -61,35 +55,33 @@ const int kUtf8ReplacementChar = 0xFFFD; #define UTF8_REJECT 12 static const uint8_t utf8d[] = { - // The first part of the table maps bytes to character classes that - // to reduce the size of the transition table and create bitmasks. - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, - - // The second part is a transition table that maps a combination - // of a state of the automaton and a character class to a state. - 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, - 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, - 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, - 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, - 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + // The first part of the table maps bytes to character classes that + // to reduce the size of the transition table and create bitmasks. 
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, }; -uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) { +static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) { uint32_t type = utf8d[byte]; - *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) - : (0xff >> type) & (byte); + *codep = + (*state != UTF8_ACCEPT) + ? (byte & 0x3fu) | (*codep << 6) + : (0xff >> type) & (byte); *state = utf8d[256 + *state + type]; return *state; @@ -113,8 +105,8 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) { // At the point the error is recorded, the code point hasn't been computed // yet (and can't be, because it's invalid), so we need to build up the raw // hex value from the bytes under the cursor. 
- uint64_t code_point = 0; - for (int i = 0; i < iter->_width; ++i) { + uint32_t code_point = 0; + for (size_t i = 0; i < iter->_width; ++i) { code_point = (code_point << 8) | (unsigned char) iter->_start[i]; } error->v.codepoint = code_point; @@ -139,10 +131,10 @@ static void read_char(Utf8Iterator* iter) { if (state == UTF8_ACCEPT) { iter->_width = c - iter->_start + 1; // This is the special handling for carriage returns that is mandated by - // the HTML5 spec. Since we're looking for particular 7-bit literal + // the HTML5 spec. Since we're looking for particular 7-bit literal // characters, we operate in terms of chars and only need a check for iter // overrun, instead of having to read in a full next code point. - // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream + // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream if (code_point == '\r') { assert(iter->_width == 1); const char* next = c + 1; @@ -171,8 +163,8 @@ static void read_char(Utf8Iterator* iter) { } } // If we got here without exiting early, then we've reached the end of the - // iterator. Add an error for truncated input, set the width to consume the - // rest of the iterator, and emit a replacement character. The next time we + // iterator. Add an error for truncated input, set the width to consume the + // rest of the iterator, and emit a replacement character. The next time we // enter this method, it will detect that there's no input to consume and // output an EOF. iter->_current = kUtf8ReplacementChar; @@ -196,13 +188,23 @@ static void update_position(Utf8Iterator* iter) { // Returns true if this Unicode code point is in the list of characters // forbidden by the HTML5 spec, such as undefined control chars. 
bool utf8_is_invalid_code_point(int c) { - return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) || - (c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) || - ((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF); + return + (c >= 0x1 && c <= 0x8) + || c == 0xB + || (c >= 0xE && c <= 0x1F) + || (c >= 0x7F && c <= 0x9F) + || (c >= 0xFDD0 && c <= 0xFDEF) + || ((c & 0xFFFF) == 0xFFFE) + || ((c & 0xFFFF) == 0xFFFF) + ; } -void utf8iterator_init(GumboParser* parser, const char* source, - size_t source_length, Utf8Iterator* iter) { +void utf8iterator_init ( + GumboParser* parser, + const char* source, + size_t source_length, + Utf8Iterator* iter +) { iter->_start = source; iter->_end = source + source_length; iter->_pos.line = 1; @@ -220,10 +222,14 @@ void utf8iterator_next(Utf8Iterator* iter) { read_char(iter); } -int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; } +int utf8iterator_current(const Utf8Iterator* iter) { + return iter->_current; +} -void utf8iterator_get_position( - const Utf8Iterator* iter, GumboSourcePosition* output) { +void utf8iterator_get_position ( + const Utf8Iterator* iter, + GumboSourcePosition* output +) { *output = iter->_pos; } @@ -235,13 +241,22 @@ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) { return iter->_end; } -bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix, - size_t length, bool case_sensitive) { - bool matched = (iter->_start + length <= iter->_end) && - (case_sensitive ? !strncmp(iter->_start, prefix, length) - : !strncasecmp(iter->_start, prefix, length)); +bool utf8iterator_maybe_consume_match ( + Utf8Iterator* iter, + const char* prefix, + size_t length, + bool case_sensitive +) { + bool matched = + (iter->_start + length <= iter->_end) + && ( + case_sensitive + ? 
!strncmp(iter->_start, prefix, length) + : !gumbo_ascii_strncasecmp(iter->_start, prefix, length) + ) + ; if (matched) { - for (unsigned int i = 0; i < length; ++i) { + for (size_t i = 0; i < length; ++i) { utf8iterator_next(iter); } return true; diff --git a/gumbo-parser/src/utf8.h b/gumbo-parser/src/utf8.h index bd31a781..0c52e5fa 100644 --- a/gumbo-parser/src/utf8.h +++ b/gumbo-parser/src/utf8.h @@ -1,41 +1,26 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// This contains an implementation of a UTF8 iterator and decoder suitable for -// an HTML5 parser. This does a bit more than straight UTF-8 decoding. The +#ifndef GUMBO_UTF8_H_ +#define GUMBO_UTF8_H_ + +// This contains an implementation of a UTF-8 iterator and decoder suitable for +// a HTML5 parser. This does a bit more than straight UTF-8 decoding. The // HTML5 spec specifies that: // 1. Decoding errors are parse errors. -// 2. Certain other codepoints (eg. control characters) are parse errors. +// 2. Certain other codepoints (e.g. control characters) are parse errors. // 3. Carriage returns and CR/LF groups are converted to line feeds. -// http://www.whatwg.org/specs/web-apps/current-work/multipage/infrastructure.html#decoded-as-utf-8,-with-error-handling +// https://encoding.spec.whatwg.org/#utf-8-decode // -// Also, we want to keep track of source positions for error handling. 
As a +// Also, we want to keep track of source positions for error handling. As a // result, we fold all that functionality into this decoder, and can't use an // off-the-shelf library. // // This header is internal-only, which is why we prefix functions with only // utf8_ or utf8_iterator_ instead of gumbo_utf8_. -#ifndef GUMBO_UTF8_H_ -#define GUMBO_UTF8_H_ - #include <stdbool.h> #include <stddef.h> #include "gumbo.h" +#include "macros.h" #ifdef __cplusplus extern "C" { @@ -51,7 +36,7 @@ typedef struct GumboInternalUtf8Iterator { // Points at the start of the code point most recently read into 'current'. const char* _start; - // Points at the mark. The mark is initially set to the beginning of the + // Points at the mark. The mark is initially set to the beginning of the // input. const char* _mark; @@ -62,7 +47,7 @@ typedef struct GumboInternalUtf8Iterator { int _current; // The width in bytes of the current code point. - int _width; + size_t _width; // The SourcePosition for the current location. GumboSourcePosition _pos; @@ -77,12 +62,16 @@ typedef struct GumboInternalUtf8Iterator { // Returns true if this Unicode code point is in the list of characters // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars. -bool utf8_is_invalid_code_point(int c); +bool utf8_is_invalid_code_point(int c) CONST_FN; -// Initializes a new Utf8Iterator from the given byte buffer. The source does +// Initializes a new Utf8Iterator from the given byte buffer. The source does // not have to be NUL-terminated, but the length must be passed in explicitly. -void utf8iterator_init(struct GumboInternalParser* parser, const char* source, - size_t source_length, Utf8Iterator* iter); +void utf8iterator_init ( + struct GumboInternalParser* parser, + const char* source, + size_t source_length, + Utf8Iterator* iter +); // Advances the current position by one code point. 
void utf8iterator_next(Utf8Iterator* iter); @@ -97,23 +86,27 @@ void utf8iterator_get_position( // Retrieves a character pointer to the start of the current character. const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter); -// Retrieves a character pointer to 1 past the end of the buffer. This is +// Retrieves a character pointer to 1 past the end of the buffer. This is // necessary for certain state machines and string comparisons that would like // to look directly for ASCII text in the buffer without going through the // decoder. const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter); // If the upcoming text in the buffer matches the specified prefix (which has -// length 'length'), consume it and return true. Otherwise, return false with -// no other effects. If the length of the string would overflow the buffer, -// this returns false. Note that prefix should not contain null bytes because -// of the use of strncmp/strncasecmp internally. All existing use-cases adhere +// length 'length'), consume it and return true. Otherwise, return false with +// no other effects. If the length of the string would overflow the buffer, +// this returns false. Note that prefix should not contain null bytes because +// of the use of strncmp/strncasecmp internally. All existing use-cases adhere // to this. -bool utf8iterator_maybe_consume_match( - Utf8Iterator* iter, const char* prefix, size_t length, bool case_sensitive); +bool utf8iterator_maybe_consume_match ( + Utf8Iterator* iter, + const char* prefix, + size_t length, + bool case_sensitive +); // "Marks" a particular location of interest in the input stream, so that it can -// later be reset() to. There's also the ability to record an error at the +// later be reset() to. There's also the ability to record an error at the // point that was marked, as oftentimes that's more useful than the last // character before the error was detected. 
void utf8iterator_mark(Utf8Iterator* iter); @@ -123,10 +116,13 @@ void utf8iterator_reset(Utf8Iterator* iter); // Sets the position and original text fields of an error to the value at the // mark. -void utf8iterator_fill_error_at_mark( - Utf8Iterator* iter, struct GumboInternalError* error); +void utf8iterator_fill_error_at_mark ( + Utf8Iterator* iter, + struct GumboInternalError* error +); #ifdef __cplusplus } #endif -#endif // GUMBO_UTF8_H_ + +#endif // GUMBO_UTF8_H_ diff --git a/gumbo-parser/src/util.c b/gumbo-parser/src/util.c index 5a24c115..5af20524 100644 --- a/gumbo-parser/src/util.c +++ b/gumbo-parser/src/util.c @@ -1,58 +1,68 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2017-2018 Craig Barnes. + Copyright 2010 Google Inc. -#include "util.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 -#include <assert.h> + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <strings.h> -#include <stdarg.h> -#include <stdio.h> - +#include "util.h" #include "gumbo.h" -#include "parser.h" -// TODO(jdtang): This should be elsewhere, but there's no .c file for -// SourcePositions and yet the constant needs some linkage, so this is as good -// as any. -const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0}; +void* gumbo_alloc(size_t size) { + void* ptr = malloc(size); + if (unlikely(ptr == NULL)) { + perror(__func__); + abort(); + } + return ptr; +} -void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) { - return parser->_options->allocator(parser->_options->userdata, num_bytes); +void* gumbo_realloc(void* ptr, size_t size) { + ptr = realloc(ptr, size); + if (unlikely(ptr == NULL)) { + perror(__func__); + abort(); + } + return ptr; } -void gumbo_parser_deallocate(GumboParser* parser, void* ptr) { - parser->_options->deallocator(parser->_options->userdata, ptr); +void gumbo_free(void* ptr) { + free(ptr); } -char* gumbo_copy_stringz(GumboParser* parser, const char* str) { - char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1); - strcpy(buffer, str); - return buffer; +char* gumbo_strdup(const char* str) { + const size_t size = strlen(str) + 1; + // The strdup(3) function isn't available in strict "-std=c99" mode + // (it's part of POSIX, not C99), so use malloc(3) and memcpy(3) + // instead: + char* buffer = gumbo_alloc(size); + return memcpy(buffer, str, size); } -// Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG -// to use. -void gumbo_debug(const char* format, ...) { #ifdef GUMBO_DEBUG +#include <stdarg.h> +// Debug function to trace operation of the parser +// (define GUMBO_DEBUG to use). +void gumbo_debug(const char* format, ...) { va_list args; va_start(args, format); vprintf(format, args); va_end(args); fflush(stdout); -#endif } +#else +void gumbo_debug(const char* UNUSED_ARG(format), ...) 
{} +#endif diff --git a/gumbo-parser/src/util.h b/gumbo-parser/src/util.h index 6ad65649..dfdf465b 100644 --- a/gumbo-parser/src/util.h +++ b/gumbo-parser/src/util.h @@ -1,60 +1,30 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) -// -// This contains some utility functions that didn't fit into any of the other -// headers. - #ifndef GUMBO_UTIL_H_ #define GUMBO_UTIL_H_ -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#endif + #include <stdbool.h> #include <stddef.h> +#include "macros.h" #ifdef __cplusplus extern "C" { #endif -// Forward declaration since it's passed into some of the functions in this -// header. -struct GumboInternalParser; - // Utility function for allocating & copying a null-terminated string into a -// freshly-allocated buffer. This is necessary for proper memory management; we +// freshly-allocated buffer. This is necessary for proper memory management; we // have the convention that all const char* in parse tree structures are // freshly-allocated, so if we didn't copy, we'd try to delete a literal string // when the parse tree is destroyed. -char* gumbo_copy_stringz(struct GumboInternalParser* parser, const char* str); - -// Allocate a chunk of memory, using the allocator specified in the Parser's -// config options. 
-void* gumbo_parser_allocate( - struct GumboInternalParser* parser, size_t num_bytes); +char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS; -// Deallocate a chunk of memory, using the deallocator specified in the Parser's -// config options. -void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr); +void* gumbo_alloc(size_t size) XMALLOC; +void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL; +void gumbo_free(void* ptr); -// Debug wrapper for printf, to make it easier to turn off debugging info when -// required. -void gumbo_debug(const char* format, ...); +// Debug wrapper for printf +void gumbo_debug(const char* format, ...) PRINTF(1); #ifdef __cplusplus } #endif -#endif // GUMBO_UTIL_H_ +#endif // GUMBO_UTIL_H_ diff --git a/gumbo-parser/src/vector.c b/gumbo-parser/src/vector.c index 51758dfe..d4dfa2af 100644 --- a/gumbo-parser/src/vector.c +++ b/gumbo-parser/src/vector.c @@ -1,81 +1,70 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) +/* + Copyright 2018 Craig Barnes. + Copyright 2010 Google Inc. -#include "vector.h" + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ #include <assert.h> #include <stdlib.h> #include <string.h> -#include <strings.h> - +#include "vector.h" #include "util.h" -struct GumboInternalParser; - -const GumboVector kGumboEmptyVector = {NULL, 0, 0}; +const GumboVector kGumboEmptyVector = { \ + .data = NULL, \ + .length = 0, \ + .capacity = 0 \ +}; -void gumbo_vector_init(struct GumboInternalParser* parser, - size_t initial_capacity, GumboVector* vector) { +void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector) { vector->length = 0; vector->capacity = initial_capacity; if (initial_capacity > 0) { - vector->data = - gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity); + vector->data = gumbo_alloc(sizeof(void*) * initial_capacity); } else { vector->data = NULL; } } -void gumbo_vector_destroy( - struct GumboInternalParser* parser, GumboVector* vector) { +void gumbo_vector_destroy(GumboVector* vector) { if (vector->capacity > 0) { - gumbo_parser_deallocate(parser, vector->data); + gumbo_free(vector->data); } } -static void enlarge_vector_if_full( - struct GumboInternalParser* parser, GumboVector* vector) { +static void enlarge_vector_if_full(GumboVector* vector) { if (vector->length >= vector->capacity) { if (vector->capacity) { - size_t old_num_bytes = sizeof(void*) * vector->capacity; vector->capacity *= 2; size_t num_bytes = sizeof(void*) * vector->capacity; - void** temp = gumbo_parser_allocate(parser, num_bytes); - memcpy(temp, vector->data, old_num_bytes); - gumbo_parser_deallocate(parser, vector->data); - vector->data = temp; + vector->data = gumbo_realloc(vector->data, 
num_bytes); } else { // 0-capacity vector; no previous array to deallocate. vector->capacity = 2; - vector->data = - gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity); + vector->data = gumbo_alloc(sizeof(void*) * vector->capacity); } } } -void gumbo_vector_add( - struct GumboInternalParser* parser, void* element, GumboVector* vector) { - enlarge_vector_if_full(parser, vector); +void gumbo_vector_add(void* element, GumboVector* vector) { + enlarge_vector_if_full(vector); assert(vector->data); assert(vector->length < vector->capacity); vector->data[vector->length++] = element; } -void* gumbo_vector_pop( - struct GumboInternalParser* parser, GumboVector* vector) { +void* gumbo_vector_pop(GumboVector* vector) { if (vector->length == 0) { return NULL; } @@ -91,33 +80,38 @@ int gumbo_vector_index_of(GumboVector* vector, const void* element) { return -1; } -void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element, - unsigned int index, GumboVector* vector) { - assert(index >= 0); +void gumbo_vector_insert_at ( + void* element, + unsigned int index, + GumboVector* vector +) { assert(index <= vector->length); - enlarge_vector_if_full(parser, vector); + enlarge_vector_if_full(vector); ++vector->length; - memmove(&vector->data[index + 1], &vector->data[index], - sizeof(void*) * (vector->length - index - 1)); + memmove ( + &vector->data[index + 1], + &vector->data[index], + sizeof(void*) * (vector->length - index - 1) + ); vector->data[index] = element; } -void gumbo_vector_remove( - struct GumboInternalParser* parser, void* node, GumboVector* vector) { +void gumbo_vector_remove(void* node, GumboVector* vector) { int index = gumbo_vector_index_of(vector, node); if (index == -1) { return; } - gumbo_vector_remove_at(parser, index, vector); + gumbo_vector_remove_at(index, vector); } -void* gumbo_vector_remove_at(struct GumboInternalParser* parser, - unsigned int index, GumboVector* vector) { - assert(index >= 0); +void* 
gumbo_vector_remove_at(unsigned int index, GumboVector* vector) { assert(index < vector->length); void* result = vector->data[index]; - memmove(&vector->data[index], &vector->data[index + 1], - sizeof(void*) * (vector->length - index - 1)); + memmove ( + &vector->data[index], + &vector->data[index + 1], + sizeof(void*) * (vector->length - index - 1) + ); --vector->length; return result; } diff --git a/gumbo-parser/src/vector.h b/gumbo-parser/src/vector.h index 70fe6fa6..5e164de3 100644 --- a/gumbo-parser/src/vector.h +++ b/gumbo-parser/src/vector.h @@ -1,19 +1,3 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Author: jdtang@google.com (Jonathan Tang) - #ifndef GUMBO_VECTOR_H_ #define GUMBO_VECTOR_H_ @@ -23,45 +7,39 @@ extern "C" { #endif -// Forward declaration since it's passed into some of the functions in this -// header. -struct GumboInternalParser; - // Initializes a new GumboVector with the specified initial capacity. -void gumbo_vector_init(struct GumboInternalParser* parser, - size_t initial_capacity, GumboVector* vector); +void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector); -// Frees the memory used by an GumboVector. Does not free the contained +// Frees the memory used by a GumboVector. Does not free the contained // pointers. 
-void gumbo_vector_destroy( - struct GumboInternalParser* parser, GumboVector* vector); +void gumbo_vector_destroy(GumboVector* vector); -// Adds a new element to an GumboVector. -void gumbo_vector_add( - struct GumboInternalParser* parser, void* element, GumboVector* vector); +// Adds a new element to a GumboVector. +void gumbo_vector_add(void* element, GumboVector* vector); // Removes and returns the element most recently added to the GumboVector. -// Ownership is transferred to caller. Capacity is unchanged. If the vector is +// Ownership is transferred to caller. Capacity is unchanged. If the vector is // empty, NULL is returned. -void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector); +void* gumbo_vector_pop(GumboVector* vector); -// Inserts an element at a specific index. This is potentially O(N) time, but +// Inserts an element at a specific index. This is potentially O(N) time, but // is necessary for some of the spec's behavior. -void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element, - unsigned int index, GumboVector* vector); +void gumbo_vector_insert_at ( + void* element, + unsigned int index, + GumboVector* vector +); // Removes an element from the vector, or does nothing if the element is not in // the vector. -void gumbo_vector_remove( - struct GumboInternalParser* parser, void* element, GumboVector* vector); +void gumbo_vector_remove(void* element, GumboVector* vector); -// Removes and returns an element at a specific index. Note that this is +// Removes and returns an element at a specific index. Note that this is // potentially O(N) time and should be used sparingly. 
-void* gumbo_vector_remove_at(struct GumboInternalParser* parser, - unsigned int index, GumboVector* vector); +void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector); #ifdef __cplusplus } #endif -#endif // GUMBO_VECTOR_H_ +#endif // GUMBO_VECTOR_H_ diff --git a/gumbo-parser/test/attribute.cc b/gumbo-parser/test/attribute.cc new file mode 100644 index 00000000..e55734aa --- /dev/null +++ b/gumbo-parser/test/attribute.cc @@ -0,0 +1,49 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jdtang@google.com (Jonathan Tang) + +#include "attribute.h" + +#include <stdlib.h> +#include <string.h> + +#include "gtest/gtest.h" +#include "test_utils.h" +#include "vector.h" + +namespace { + +class GumboAttributeTest : public GumboTest { + protected: + GumboAttributeTest() { gumbo_vector_init(2, &vector_); } + + ~GumboAttributeTest() { gumbo_vector_destroy(&vector_); } + + GumboVector vector_; +}; + +TEST_F(GumboAttributeTest, GetAttribute) { + GumboAttribute attr1; + GumboAttribute attr2; + attr1.name = ""; + attr2.name = "foo"; + + gumbo_vector_add(&attr1, &vector_); + gumbo_vector_add(&attr2, &vector_); + EXPECT_EQ(&attr2, gumbo_get_attribute(&vector_, "foo")); + EXPECT_EQ(NULL, gumbo_get_attribute(&vector_, "bar")); +} + +} // namespace diff --git a/gumbo-parser/test/char_ref.cc b/gumbo-parser/test/char_ref.cc new file mode 100644 index 00000000..de3cf06a --- /dev/null +++ b/gumbo-parser/test/char_ref.cc @@ -0,0 +1,172 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) +// +// GUnit char_ref tests. These are quick smoke tests, mostly to identify +// crashing bugs so that they can be fixed without having to debug +// multi-language tests. As such, they focus on coverage rather than +// completeness. For testing the full spec, use char_ref_py_tests, which share +// their testdata with the Python html5lib library. 
+ +#include "char_ref.h" + +#include <stdio.h> +#include <string.h> + +#include "gtest/gtest.h" +#include "test_utils.h" +#include "utf8.h" + +namespace { + +class CharRefTest : public GumboTest { + protected: + bool ConsumeCharRef(const char* input) { + return ConsumeCharRef(input, ' ', false); + } + + bool ConsumeCharRef( + const char* input, int additional_allowed_char, bool is_in_attribute) { + text_ = input; + utf8iterator_init(&parser_, input, strlen(input), &iter_); + bool result = gumbo_consume_char_ref( + &parser_, &iter_, additional_allowed_char, is_in_attribute, &output_); + fflush(stdout); + return result; + } + + Utf8Iterator iter_; + OneOrTwoCodepoints output_; +}; + +TEST_F(CharRefTest, Whitespace) { + EXPECT_TRUE(ConsumeCharRef("  ")); + EXPECT_EQ(kGumboNoChar, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NumericHex) { + EXPECT_TRUE(ConsumeCharRef("ካ")); + EXPECT_EQ(0x12ab, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NumericDecimal) { + EXPECT_TRUE(ConsumeCharRef("Ӓ")); + EXPECT_EQ(1234, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NumericInvalidDigit) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef("&#google")); + EXPECT_EQ(kGumboNoChar, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); + EXPECT_EQ('&', utf8iterator_current(&iter_)); +} + +TEST_F(CharRefTest, NumericNoSemicolon) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef("Ӓgoogle")); + EXPECT_EQ(1234, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); + EXPECT_EQ('g', utf8iterator_current(&iter_)); +} + +TEST_F(CharRefTest, NumericReplacement) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef("‚")); + // Low quotation mark character. 
+ EXPECT_EQ(0x201A, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NumericInvalid) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef("�")); + EXPECT_EQ(0xFFFD, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NumericUtfInvalid) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef("")); + EXPECT_EQ(0x7, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NamedReplacement) { + EXPECT_TRUE(ConsumeCharRef("<")); + EXPECT_EQ('<', output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NamedReplacementNoSemicolon) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef(">")); + EXPECT_EQ('>', output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NamedReplacementWithInvalidUtf8) { + errors_are_expected_ = true; + EXPECT_TRUE(ConsumeCharRef("&\xc3\xa5")); + EXPECT_EQ(kGumboNoChar, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); +} + +TEST_F(CharRefTest, NamedReplacementInvalid) { + errors_are_expected_ = true; + EXPECT_FALSE(ConsumeCharRef("&google;")); + EXPECT_EQ(kGumboNoChar, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); + EXPECT_EQ('&', utf8iterator_current(&iter_)); +} + +// TEST_F(CharRefTest, NamedReplacementInvalidNoSemicolon) { +// EXPECT_FALSE(ConsumeCharRef("&google")); +// EXPECT_EQ(kGumboNoChar, output_.first); +// EXPECT_EQ(kGumboNoChar, output_.second); +// EXPECT_EQ('&', utf8iterator_current(&iter_)); +//} + +TEST_F(CharRefTest, AdditionalAllowedChar) { + EXPECT_TRUE(ConsumeCharRef("&\"", '"', false)); + EXPECT_EQ(kGumboNoChar, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); + EXPECT_EQ('&', utf8iterator_current(&iter_)); +} + +TEST_F(CharRefTest, InAttribute) { + EXPECT_TRUE(ConsumeCharRef("¬ed", ' ', true)); + EXPECT_EQ(kGumboNoChar, output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); + EXPECT_EQ('&', 
utf8iterator_current(&iter_)); +} + +TEST_F(CharRefTest, MultiChars) { + EXPECT_TRUE(ConsumeCharRef("⋵̸")); + EXPECT_EQ(0x22F5, output_.first); + EXPECT_EQ(0x0338, output_.second); +} + +TEST_F(CharRefTest, CharAfter) { + EXPECT_TRUE(ConsumeCharRef("<x")); + EXPECT_EQ('<', output_.first); + EXPECT_EQ(kGumboNoChar, output_.second); + EXPECT_EQ('x', utf8iterator_current(&iter_)); +} + +} // namespace diff --git a/gumbo-parser/test/parser.cc b/gumbo-parser/test/parser.cc new file mode 100644 index 00000000..d85e286a --- /dev/null +++ b/gumbo-parser/test/parser.cc @@ -0,0 +1,1996 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jdtang@google.com (Jonathan Tang) + +#include <string> +#include "gumbo.h" +#include "gtest/gtest.h" +#include "test_utils.h" + +namespace { + +class GumboParserTest : public ::testing::Test { + protected: + GumboParserTest() + : options_(kGumboDefaultOptions), output_(NULL), root_(NULL) + {} + + virtual ~GumboParserTest() { + if (output_) { + gumbo_destroy_output(output_); + } + } + + virtual void Parse(const char* input) { + if (output_) { + gumbo_destroy_output(output_); + } + + output_ = gumbo_parse_with_options(&options_, input, strlen(input)); + // The naming inconsistency is because these tests were initially written + // when gumbo_parse returned the document element instead of an GumboOutput + // structure. 
+ root_ = output_->document; + } + + virtual void ParseFragment( + const char* input, GumboTag context, GumboNamespaceEnum context_ns) { + if (output_) { + gumbo_destroy_output(output_); + } + + options_.fragment_context = context; + options_.fragment_namespace = context_ns; + output_ = gumbo_parse_with_options(&options_, input, strlen(input)); + root_ = output_->document; + } + + virtual void Parse(const std::string& input) { + // This overload is so we can test/demonstrate that computing offsets from + // the .data() member of an STL string works properly. + if (output_) { + gumbo_destroy_output(output_); + } + + output_ = gumbo_parse_with_options(&options_, input.data(), input.length()); + root_ = output_->document; + SanityCheckPointers(input.data(), input.length(), output_->root, 1000); + } + + GumboOptions options_; + GumboOutput* output_; + GumboNode* root_; +}; + +TEST_F(GumboParserTest, NullDocument) { + Parse(""); + ASSERT_TRUE(root_); + ASSERT_EQ(GUMBO_NODE_DOCUMENT, root_->type); + EXPECT_EQ(GUMBO_INSERTION_BY_PARSER, root_->parse_flags); + + GumboNode* body; + GetAndAssertBody(root_, &body); +} + +TEST_F(GumboParserTest, ParseTwice) { + Parse(""); + ASSERT_TRUE(root_); + ASSERT_EQ(GUMBO_NODE_DOCUMENT, root_->type); + + std::string second_input(""); + Parse(second_input); + ASSERT_TRUE(root_); + ASSERT_EQ(GUMBO_NODE_DOCUMENT, root_->type); + + GumboNode* body; + GetAndAssertBody(root_, &body); +} + +TEST_F(GumboParserTest, OneChar) { + std::string input("T"); + Parse(input); + ASSERT_TRUE(root_); + ASSERT_EQ(GUMBO_NODE_DOCUMENT, root_->type); + EXPECT_EQ(GUMBO_INSERTION_BY_PARSER, root_->parse_flags); + ASSERT_EQ(1, GetChildCount(root_)); + + GumboNode* html = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_TRUE(html->parse_flags & GUMBO_INSERTION_BY_PARSER); + EXPECT_TRUE(html->parse_flags & GUMBO_INSERTION_IMPLICIT_END_TAG); + EXPECT_TRUE(html->parse_flags & GUMBO_INSERTION_IMPLIED); + EXPECT_EQ(GUMBO_TAG_HTML, 
html->v.element.tag); + ASSERT_EQ(2, GetChildCount(html)); + + GumboNode* head = GetChild(html, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, head->type); + EXPECT_EQ(GUMBO_TAG_HEAD, head->v.element.tag); + EXPECT_EQ(0, GetChildCount(head)); + + GumboNode* body = GetChild(html, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, body->v.element.tag); + ASSERT_EQ(1, GetChildCount(body)); + EXPECT_EQ(1, body->v.element.start_pos.line); + EXPECT_EQ(1, body->v.element.start_pos.column); + EXPECT_EQ(0, body->v.element.start_pos.offset); + EXPECT_EQ(1, body->v.element.end_pos.line); + EXPECT_EQ(2, body->v.element.end_pos.column); + EXPECT_EQ(1, body->v.element.end_pos.offset); + + GumboNode* text = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("T", text->v.text.text); + EXPECT_EQ(1, text->v.text.start_pos.line); + EXPECT_EQ(1, text->v.text.start_pos.column); + EXPECT_EQ(0, text->v.text.start_pos.offset); + EXPECT_EQ(input.data(), text->v.text.original_text.data); + EXPECT_EQ(1, text->v.text.original_text.length); +} + +TEST_F(GumboParserTest, TextOnly) { + Parse("Test"); + EXPECT_EQ(1, output_->errors.length); // No doctype. 
+ ASSERT_EQ(1, GetChildCount(root_)); + + GumboNode* html = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_EQ(GUMBO_TAG_HTML, html->v.element.tag); + ASSERT_EQ(2, GetChildCount(html)); + + GumboNode* head = GetChild(html, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, head->type); + EXPECT_EQ(GUMBO_TAG_HEAD, head->v.element.tag); + EXPECT_EQ(0, GetChildCount(head)); + + GumboNode* body = GetChild(html, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, body->v.element.tag); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* text = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Test", text->v.text.text); +} + +TEST_F(GumboParserTest, SelfClosingTagError) { + Parse("<div/>"); + // No DOCTYPE + // Tag cannot be self-closing + // EOF with div still open + EXPECT_EQ(3, output_->errors.length); +} + +TEST_F(GumboParserTest, SelfClosingTagWithComplexProcessing) { + Parse("<br/>"); + ASSERT_EQ(1, output_->errors.length); // No doctype. 
+ ASSERT_EQ(1, GetChildCount(root_)); + + GumboNode* html = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_EQ(GUMBO_TAG_HTML, html->v.element.tag); + ASSERT_EQ(2, GetChildCount(html)); + + GumboNode* head = GetChild(html, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, head->type); + EXPECT_EQ(GUMBO_TAG_HEAD, head->v.element.tag); + EXPECT_EQ(0, GetChildCount(head)); + + GumboNode* body = GetChild(html, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, body->v.element.tag); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* br = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, br->type); + EXPECT_EQ(GUMBO_TAG_BR, br->v.element.tag); +} + +TEST_F(GumboParserTest, UnexpectedEndBreak) { + Parse("</br><div></div>"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* br = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, br->type); + EXPECT_EQ(GUMBO_TAG_BR, br->v.element.tag); + ASSERT_EQ(0, GetChildCount(br)); + + GumboNode* div = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, div->v.element.tag); + ASSERT_EQ(0, GetChildCount(div)); +} + +TEST_F(GumboParserTest, CaseSensitiveAttributes) { + Parse("<div class=CamelCase>"); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* div = GetChild(body, 0); + GumboVector* attributes = &div->v.element.attributes; + ASSERT_EQ(1, attributes->length); + + GumboAttribute* clas = static_cast<GumboAttribute*>(attributes->data[0]); + EXPECT_EQ(GUMBO_ATTR_NAMESPACE_NONE, clas->attr_namespace); + EXPECT_STREQ("class", clas->name); + EXPECT_STREQ("CamelCase", clas->value); +} + +TEST_F(GumboParserTest, ExplicitHtmlStructure) { + Parse( + "<!doctype html>\n<html>" + "<head><title>Foo\n" + "

Test
"); + ASSERT_EQ(1, GetChildCount(root_)); + EXPECT_EQ(0, output_->errors.length); + + ASSERT_EQ(GUMBO_NODE_DOCUMENT, root_->type); + EXPECT_STREQ("html", root_->v.document.name); + EXPECT_STREQ("", root_->v.document.public_identifier); + EXPECT_STREQ("", root_->v.document.system_identifier); + + GumboNode* html = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, html->parse_flags); + EXPECT_EQ(GUMBO_TAG_HTML, html->v.element.tag); + EXPECT_EQ(2, html->v.element.start_pos.line); + EXPECT_EQ(1, html->v.element.start_pos.column); + EXPECT_EQ(16, html->v.element.start_pos.offset); + EXPECT_EQ(3, html->v.element.end_pos.line); + EXPECT_EQ(39, html->v.element.end_pos.column); + EXPECT_EQ(92, html->v.element.end_pos.offset); + EXPECT_EQ("", ToString(html->v.element.original_tag)); + EXPECT_EQ("", ToString(html->v.element.original_end_tag)); + ASSERT_EQ(3, GetChildCount(html)); + + GumboNode* head = GetChild(html, 0); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, head->parse_flags); + ASSERT_EQ(GUMBO_NODE_ELEMENT, head->type); + EXPECT_EQ(GUMBO_TAG_HEAD, head->v.element.tag); + EXPECT_EQ(html, head->parent); + EXPECT_EQ(0, head->index_within_parent); + EXPECT_EQ(1, GetChildCount(head)); + + GumboNode* body = GetChild(html, 2); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, body->parse_flags); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, body->v.element.tag); + EXPECT_EQ(html, body->parent); + EXPECT_EQ(3, body->v.element.start_pos.line); + EXPECT_EQ(1, body->v.element.start_pos.column); + EXPECT_EQ(54, body->v.element.start_pos.offset); + EXPECT_EQ(3, body->v.element.end_pos.line); + EXPECT_EQ(32, body->v.element.end_pos.column); + EXPECT_EQ(85, body->v.element.end_pos.offset); + EXPECT_EQ("", ToString(body->v.element.original_tag)); + EXPECT_EQ("", ToString(body->v.element.original_end_tag)); + EXPECT_EQ(2, body->index_within_parent); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* div = GetChild(body, 0); + 
ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, div->v.element.tag); + EXPECT_EQ(body, div->parent); + EXPECT_EQ(0, div->index_within_parent); + ASSERT_EQ(1, GetChildCount(div)); + + ASSERT_EQ(1, GetAttributeCount(div)); + GumboAttribute* clas = GetAttribute(div, 0); + EXPECT_STREQ("class", clas->name); + EXPECT_EQ("class", ToString(clas->original_name)); + EXPECT_STREQ("bar", clas->value); + EXPECT_EQ("bar", ToString(clas->original_value)); + + GumboNode* text = GetChild(div, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Test", text->v.text.text); +} + +TEST_F(GumboParserTest, Whitespace) { + Parse("
    \n
  • Text\n
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* ul = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, ul->type); + EXPECT_EQ(GUMBO_TAG_UL, ul->v.element.tag); + ASSERT_EQ(2, GetChildCount(ul)); + + GumboNode* whitespace = GetChild(ul, 0); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, whitespace->type); + EXPECT_STREQ("\n ", whitespace->v.text.text); + + GumboNode* li = GetChild(ul, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, li->type); + EXPECT_EQ(GUMBO_TAG_LI, li->v.element.tag); + ASSERT_EQ(1, GetChildCount(li)); + + GumboNode* text = GetChild(li, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Text\n", text->v.text.text); +} + +TEST_F(GumboParserTest, DuplicateAttributes) { + std::string text(""); + Parse(text); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* input = GetChild(body, 0); + EXPECT_EQ(GUMBO_INSERTION_IMPLICIT_END_TAG, input->parse_flags); + ASSERT_EQ(GUMBO_NODE_ELEMENT, input->type); + EXPECT_EQ(GUMBO_TAG_INPUT, input->v.element.tag); + EXPECT_EQ(0, GetChildCount(input)); + ASSERT_EQ(2, GetAttributeCount(input)); + + GumboAttribute* checked = GetAttribute(input, 0); + EXPECT_STREQ("checked", checked->name); + EXPECT_STREQ("false", checked->value); + EXPECT_EQ(1, checked->name_start.line); + EXPECT_EQ(8, checked->name_start.column); + EXPECT_EQ(15, checked->name_end.column); + EXPECT_EQ(16, checked->value_start.column); + EXPECT_EQ(23, checked->value_end.column); + EXPECT_EQ(7, checked->original_name.data - text.data()); + EXPECT_EQ(7, checked->original_name.length); + EXPECT_EQ(15, checked->original_value.data - text.data()); + EXPECT_EQ(7, checked->original_value.length); + + GumboAttribute* id = GetAttribute(input, 1); + EXPECT_STREQ("id", id->name); + EXPECT_STREQ("foo", id->value); + + // TODO(jdtang): Run some assertions on the parse error that's added. 
+} + +TEST_F(GumboParserTest, LinkTagsInHead) { + Parse( + "\n" + " \n" + " Sample title>\n\n" + " \n" + " \n" + " \n" + " Foo"); + ASSERT_EQ(1, GetChildCount(root_)); + + GumboNode* html = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_EQ(GUMBO_INSERTION_IMPLICIT_END_TAG, html->parse_flags); + EXPECT_EQ(GUMBO_TAG_HTML, html->v.element.tag); + ASSERT_EQ(3, GetChildCount(html)); + + GumboNode* head = GetChild(html, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, head->type); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, head->parse_flags); + EXPECT_EQ(GUMBO_TAG_HEAD, head->v.element.tag); + EXPECT_EQ(7, GetChildCount(head)); + + GumboNode* text1 = GetChild(head, 2); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, text1->type); + EXPECT_STREQ("\n\n ", text1->v.text.text); + + GumboNode* link1 = GetChild(head, 3); + ASSERT_EQ(GUMBO_NODE_ELEMENT, link1->type); + EXPECT_EQ(GUMBO_TAG_LINK, link1->v.element.tag); + EXPECT_EQ(GUMBO_INSERTION_IMPLICIT_END_TAG, link1->parse_flags); + EXPECT_EQ(0, GetChildCount(link1)); + + GumboNode* text2 = GetChild(head, 4); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, text2->type); + EXPECT_STREQ("\n ", text2->v.text.text); + + GumboNode* link2 = GetChild(head, 5); + ASSERT_EQ(GUMBO_NODE_ELEMENT, link2->type); + EXPECT_EQ(GUMBO_TAG_LINK, link2->v.element.tag); + EXPECT_EQ(GUMBO_INSERTION_IMPLICIT_END_TAG, link2->parse_flags); + EXPECT_EQ(0, GetChildCount(link2)); + + GumboNode* text3 = GetChild(head, 6); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, text3->type); + EXPECT_STREQ("\n ", text3->v.text.text); + + GumboNode* body = GetChild(html, 2); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, body->parse_flags); + EXPECT_EQ(GUMBO_TAG_BODY, body->v.element.tag); + ASSERT_EQ(1, GetChildCount(body)); +} + +TEST_F(GumboParserTest, WhitespaceBeforeHtml) { + Parse("\nTest"); + ASSERT_EQ(1, GetChildCount(root_)); + + GumboNode* body = GetChild(GetChild(root_, 0), 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, 
GetTag(body)); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* text = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Test", text->v.text.text); +} + +TEST_F(GumboParserTest, TextAfterHtml) { + Parse("Test after doc"); + GumboNode* body; + GetAndAssertBody(root_, &body); + + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, GetTag(body)); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* text = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Test after doc", text->v.text.text); +} + +TEST_F(GumboParserTest, WhitespaceInHead) { + Parse(" Test"); + + GumboNode* html = GetChild(root_, 0); + EXPECT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_EQ(GUMBO_TAG_HTML, GetTag(html)); + EXPECT_EQ(2, GetChildCount(html)); + + GumboNode* head = GetChild(html, 0); + EXPECT_EQ(GUMBO_NODE_ELEMENT, head->type); + EXPECT_EQ(GUMBO_TAG_HEAD, GetTag(head)); + EXPECT_EQ(0, GetChildCount(head)); + + GumboNode* body = GetChild(html, 1); + EXPECT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, GetTag(body)); + + GumboNode* text = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Test", text->v.text.text); +} + +TEST_F(GumboParserTest, Doctype) { + Parse("Test"); + GumboDocument* doc = &root_->v.document; + EXPECT_EQ(1, doc->children.length); + EXPECT_EQ(GUMBO_DOCTYPE_NO_QUIRKS, doc->doc_type_quirks_mode); + + EXPECT_STREQ("html", doc->name); + EXPECT_STREQ("", doc->public_identifier); + EXPECT_STREQ("", doc->system_identifier); +} + +TEST_F(GumboParserTest, InvalidDoctype) { + Parse("Test"); + + // Default doc token; the declared one is ignored. 
+ GumboDocument* doc = &root_->v.document; + EXPECT_EQ(1, doc->children.length); + EXPECT_EQ(GUMBO_DOCTYPE_QUIRKS, doc->doc_type_quirks_mode); + + EXPECT_STREQ("", doc->name); + EXPECT_STREQ("", doc->public_identifier); + EXPECT_STREQ("", doc->system_identifier); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, GetTag(body)); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* text = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Test", text->v.text.text); +} + +TEST_F(GumboParserTest, SingleComment) { + Parse(""); + GumboNode* comment = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_COMMENT, comment->type); + EXPECT_STREQ(" comment ", comment->v.text.text); +} + +TEST_F(GumboParserTest, CommentInText) { + Parse("Start end"); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(3, GetChildCount(body)); + + GumboNode* start = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, start->type); + EXPECT_STREQ("Start ", start->v.text.text); + + GumboNode* comment = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_COMMENT, comment->type); + EXPECT_EQ(body, comment->parent); + EXPECT_EQ(1, comment->index_within_parent); + EXPECT_STREQ(" comment ", comment->v.text.text); + + GumboNode* end = GetChild(body, 2); + ASSERT_EQ(GUMBO_NODE_TEXT, end->type); + EXPECT_STREQ(" end", end->v.text.text); +} + +TEST_F(GumboParserTest, CommentBeforeNode) { + Parse("\n

hello world!

"); + GumboNode* comment = GetChild(root_, 0); + ASSERT_EQ(GUMBO_NODE_COMMENT, comment->type); + EXPECT_STREQ("This is a comment", comment->v.text.text); + EXPECT_EQ( + "", ToString(comment->v.text.original_text)); + + // Newline is ignored per the rules for "initial" insertion mode. + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* h1 = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, h1->type); + EXPECT_EQ(GUMBO_TAG_H1, h1->v.element.tag); +} + +TEST_F(GumboParserTest, CommentInVerbatimMode) { + Parse("
Text
"); + + GumboNode* html = GetChild(root_, 0); + EXPECT_EQ(GUMBO_NODE_ELEMENT, html->type); + EXPECT_EQ(GUMBO_TAG_HTML, GetTag(html)); + EXPECT_EQ(GUMBO_INSERTION_BY_PARSER | GUMBO_INSERTION_IMPLIED | + GUMBO_INSERTION_IMPLICIT_END_TAG, + html->parse_flags); + EXPECT_EQ(3, GetChildCount(html)); + + GumboNode* body = GetChild(html, 1); + EXPECT_EQ(GUMBO_NODE_ELEMENT, body->type); + EXPECT_EQ(GUMBO_TAG_BODY, GetTag(body)); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, body->parse_flags); + EXPECT_EQ(3, GetChildCount(body)); + + GumboNode* comment = GetChild(html, 2); + ASSERT_EQ(GUMBO_NODE_COMMENT, comment->type); + EXPECT_EQ(GUMBO_INSERTION_NORMAL, comment->parse_flags); + EXPECT_STREQ(" comment \n\n", comment->v.text.text); +} + +TEST_F(GumboParserTest, UnknownTag) { + Parse("1

2"); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* foo = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, foo->type); + EXPECT_EQ(GUMBO_TAG_UNKNOWN, GetTag(foo)); + EXPECT_EQ("", ToString(foo->v.element.original_tag)); + // According to the spec, the misplaced end tag is ignored, and so we return + // an empty original_end_tag text. We may want to extend our error-reporting + // a bit so that we close off the tag that it *would have closed*, had the + // HTML been correct, along with a parse flag that says the end tag was in the + // wrong place. + EXPECT_EQ("", ToString(foo->v.element.original_end_tag)); +} + +TEST_F(GumboParserTest, UnknownTag2) { + Parse("

"); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* div = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(div)); + GumboNode* sarcasm = GetChild(div, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, sarcasm->type); + EXPECT_EQ(GUMBO_TAG_UNKNOWN, GetTag(sarcasm)); + EXPECT_EQ("", ToString(sarcasm->v.element.original_tag)); + EXPECT_EQ("", ToString(sarcasm->v.element.original_end_tag)); +} + +TEST_F(GumboParserTest, InvalidEndTag) { + Parse(""); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* a = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, a->type); + EXPECT_EQ(GUMBO_TAG_A, GetTag(a)); + ASSERT_EQ(1, GetChildCount(a)); + + GumboNode* img = GetChild(a, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, img->type); + EXPECT_EQ(GUMBO_TAG_IMG, GetTag(img)); + ASSERT_EQ(0, GetChildCount(img)); +} + +TEST_F(GumboParserTest, Tables) { + Parse( + "\n" + "
\n" + " \n" + " \n" + " \n" + " " + "
OneTwo
"); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(4, GetChildCount(body)); + + GumboNode* br = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, br->type); + EXPECT_EQ(GUMBO_TAG_BR, GetTag(br)); + EXPECT_EQ(body, br->parent); + EXPECT_EQ(0, br->index_within_parent); + ASSERT_EQ(0, GetChildCount(br)); + + GumboNode* iframe = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, iframe->type); + EXPECT_EQ(GUMBO_TAG_IFRAME, GetTag(iframe)); + ASSERT_EQ(0, GetChildCount(iframe)); + + GumboNode* table = GetChild(body, 2); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + EXPECT_EQ(body, table->parent); + EXPECT_EQ(2, table->index_within_parent); + ASSERT_EQ(2, GetChildCount(table)); + + GumboNode* table_text = GetChild(table, 0); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, table_text->type); + EXPECT_STREQ("\n ", table_text->v.text.text); + + GumboNode* tbody = GetChild(table, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody)); + ASSERT_EQ(2, GetChildCount(tbody)); + // Second node is whitespace. + + GumboNode* tr = GetChild(tbody, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr)); + ASSERT_EQ(5, GetChildCount(tr)); // Including whitespace. 
+ + GumboNode* tr_text = GetChild(tr, 0); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, tr_text->type); + EXPECT_EQ(tr, tr_text->parent); + EXPECT_EQ(0, tr_text->index_within_parent); + EXPECT_STREQ("\n ", tr_text->v.text.text); + + GumboNode* th = GetChild(tr, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, th->type); + EXPECT_EQ(GUMBO_TAG_TH, GetTag(th)); + EXPECT_EQ(tr, th->parent); + EXPECT_EQ(1, th->index_within_parent); + ASSERT_EQ(1, GetChildCount(th)); + + GumboNode* th_text = GetChild(th, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, th_text->type); + EXPECT_STREQ("One", th_text->v.text.text); + + GumboNode* td = GetChild(tr, 3); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td)); + ASSERT_EQ(1, GetChildCount(td)); + + GumboNode* td_text = GetChild(td, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, td_text->type); + EXPECT_STREQ("Two", td_text->v.text.text); + + GumboNode* td2_text = GetChild(td, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, td2_text->type); + EXPECT_STREQ("Two", td2_text->v.text.text); + + GumboNode* div = GetChild(body, 3); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, GetTag(div)); + ASSERT_EQ(0, GetChildCount(div)); +} + +TEST_F(GumboParserTest, StartParagraphInTable) { + Parse("

foo

"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* paragraph = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, paragraph->type); + EXPECT_EQ(GUMBO_TAG_P, GetTag(paragraph)); + EXPECT_EQ(body, paragraph->parent); + EXPECT_EQ(0, paragraph->index_within_parent); + ASSERT_EQ(1, GetChildCount(paragraph)); + + GumboNode* text = GetChild(paragraph, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("foo", text->v.text.text); + + GumboNode* table = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + EXPECT_EQ(body, table->parent); + EXPECT_EQ(1, table->index_within_parent); + ASSERT_EQ(0, GetChildCount(table)); +} + +TEST_F(GumboParserTest, EndParagraphInTable) { + Parse("

"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* paragraph = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, paragraph->type); + EXPECT_EQ(GUMBO_TAG_P, GetTag(paragraph)); + EXPECT_EQ(body, paragraph->parent); + EXPECT_EQ(0, paragraph->index_within_parent); + ASSERT_EQ(0, GetChildCount(paragraph)); + + GumboNode* table = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + EXPECT_EQ(body, table->parent); + EXPECT_EQ(1, table->index_within_parent); + ASSERT_EQ(0, GetChildCount(table)); +} + +TEST_F(GumboParserTest, UnknownTagInTable) { + Parse("bar
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* foo = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, foo->type); + EXPECT_EQ(GUMBO_TAG_UNKNOWN, GetTag(foo)); + EXPECT_EQ("", ToString(foo->v.element.original_tag)); + EXPECT_EQ(body, foo->parent); + EXPECT_EQ(0, foo->index_within_parent); + ASSERT_EQ(1, GetChildCount(foo)); + + GumboNode* bar = GetChild(foo, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, bar->type); + EXPECT_STREQ("bar", bar->v.text.text); + + GumboNode* table = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + EXPECT_EQ(body, table->parent); + EXPECT_EQ(1, table->index_within_parent); + ASSERT_EQ(0, GetChildCount(table)); +} + +TEST_F(GumboParserTest, UnclosedTableTags) { + Parse( + "\n" + " \n" + "
One\n" + " Two\n" + "
Row2\n" + "
Row3\n" + "
\n" + ""); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* table = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + ASSERT_EQ(2, GetChildCount(table)); + + GumboNode* table_text = GetChild(table, 0); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, table_text->type); + EXPECT_STREQ("\n ", table_text->v.text.text); + + GumboNode* tbody = GetChild(table, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody)); + ASSERT_EQ(3, GetChildCount(tbody)); + + GumboNode* tr = GetChild(tbody, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr)); + ASSERT_EQ(3, GetChildCount(tr)); + + GumboNode* tr_text = GetChild(tr, 0); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, tr_text->type); + EXPECT_STREQ("\n ", tr_text->v.text.text); + + GumboNode* td1 = GetChild(tr, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td1->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td1)); + ASSERT_EQ(1, GetChildCount(td1)); + + GumboNode* td1_text = GetChild(td1, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, td1_text->type); + EXPECT_STREQ("One\n ", td1_text->v.text.text); + + GumboNode* td2 = GetChild(tr, 2); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td2->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td2)); + ASSERT_EQ(1, GetChildCount(td2)); + + GumboNode* td2_text = GetChild(td2, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, td2_text->type); + EXPECT_STREQ("Two\n ", td2_text->v.text.text); + + GumboNode* tr3 = GetChild(tbody, 2); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr3->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr3)); + ASSERT_EQ(1, GetChildCount(tr3)); + + GumboNode* body_text = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_WHITESPACE, body_text->type); + EXPECT_STREQ("\n", body_text->v.text.text); +} + +TEST_F(GumboParserTest, MisnestedTable) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* div = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, GetTag(div)); + ASSERT_EQ(0, GetChildCount(div)); + + GumboNode* table = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + ASSERT_EQ(1, GetChildCount(table)); + + GumboNode* tbody = GetChild(table, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody)); + ASSERT_EQ(1, GetChildCount(tbody)); + + GumboNode* tr = GetChild(tbody, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr)); + ASSERT_EQ(1, GetChildCount(tr)); + + GumboNode* td = GetChild(tr, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td)); + ASSERT_EQ(0, GetChildCount(td)); +} + +TEST_F(GumboParserTest, MisnestedTable2) { + Parse(" + GumboNode* cell3 = GetChild(td1, 1); + ASSERT_EQ(GUMBO_NODE_TEXT, cell3->type); + EXPECT_STREQ("Cell3", cell3->v.text.text); + + GumboNode* table2 = GetChild(td1, 2); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table2->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table2)); + ASSERT_EQ(1, GetChildCount(table2)); + + GumboNode* tbody2 = GetChild(table2, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody2->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody2)); + ASSERT_EQ(2, GetChildCount(tbody2)); + + GumboNode* tr2 = GetChild(tbody2, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr2->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr2)); + ASSERT_EQ(1, GetChildCount(tr2)); + + GumboNode* th = GetChild(tr2, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, th->type); + EXPECT_EQ(GUMBO_TAG_TH, GetTag(th)); + ASSERT_EQ(1, GetChildCount(th)); + + GumboNode* cell2 = GetChild(th, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, cell2->type); + EXPECT_STREQ("Cell2", cell2->v.text.text); + + GumboNode* tr3 = GetChild(tbody2, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr3->type); + 
EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr3)); + ASSERT_EQ(0, GetChildCount(tr3)); +} + +TEST_F(GumboParserTest, Select) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* select = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, select->type); + EXPECT_EQ(GUMBO_TAG_SELECT, GetTag(select)); + ASSERT_EQ(2, GetChildCount(select)); + + GumboNode* option1 = GetChild(select, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, option1->type); + EXPECT_EQ(GUMBO_TAG_OPTION, GetTag(option1)); + ASSERT_EQ(1, GetChildCount(option1)); + + GumboNode* option2 = GetChild(select, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, option2->type); + EXPECT_EQ(GUMBO_TAG_OPTION, GetTag(option2)); + ASSERT_EQ(1, GetChildCount(option2)); + + GumboNode* div = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, GetTag(div)); + ASSERT_EQ(0, GetChildCount(div)); +} + +TEST_F(GumboParserTest, ComplicatedSelect) { + Parse( + ""); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* select = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, select->type); + EXPECT_EQ(GUMBO_TAG_SELECT, GetTag(select)); + ASSERT_EQ(1, GetChildCount(select)); + + GumboNode* optgroup = GetChild(select, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, optgroup->type); + EXPECT_EQ(GUMBO_TAG_OPTGROUP, GetTag(optgroup)); + ASSERT_EQ(1, GetChildCount(optgroup)); + + GumboNode* option = GetChild(optgroup, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, option->type); + EXPECT_EQ(GUMBO_TAG_OPTION, GetTag(option)); + ASSERT_EQ(1, GetChildCount(option)); + + GumboNode* text = GetChild(option, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("Option", text->v.text.text); + + GumboNode* input = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, input->type); + EXPECT_EQ(GUMBO_TAG_INPUT, GetTag(input)); + ASSERT_EQ(0, GetChildCount(input)); +} + +TEST_F(GumboParserTest, DoubleSelect) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* select = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, select->type); + EXPECT_EQ(GUMBO_TAG_SELECT, GetTag(select)); + ASSERT_EQ(0, GetChildCount(select)); + + GumboNode* div = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, GetTag(div)); + ASSERT_EQ(0, GetChildCount(div)); +} + +TEST_F(GumboParserTest, InputInSelect) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(3, GetChildCount(body)); + + GumboNode* select = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, select->type); + EXPECT_EQ(GUMBO_TAG_SELECT, GetTag(select)); + ASSERT_EQ(0, GetChildCount(select)); + + GumboNode* input = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, input->type); + EXPECT_EQ(GUMBO_TAG_INPUT, GetTag(input)); + ASSERT_EQ(0, GetChildCount(input)); + + GumboNode* div = GetChild(body, 2); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, GetTag(div)); + ASSERT_EQ(0, GetChildCount(div)); +} + +TEST_F(GumboParserTest, SelectInTable) { + Parse("
Cell1Cell3
Cell2
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* table1 = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table1->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table1)); + ASSERT_EQ(1, GetChildCount(table1)); + + GumboNode* tbody1 = GetChild(table1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody1->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody1)); + ASSERT_EQ(1, GetChildCount(tbody1)); + + GumboNode* tr1 = GetChild(tbody1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr1->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr1)); + ASSERT_EQ(1, GetChildCount(tr1)); + + GumboNode* td1 = GetChild(tr1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td1->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td1)); + ASSERT_EQ(3, GetChildCount(td1)); + + GumboNode* cell1 = GetChild(td1, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, cell1->type); + EXPECT_STREQ("Cell1", cell1->v.text.text); + + // Foster-parented out of the inner
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* table = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + ASSERT_EQ(1, GetChildCount(table)); + + GumboNode* tbody = GetChild(table, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody)); + ASSERT_EQ(1, GetChildCount(tbody)); + + GumboNode* tr = GetChild(tbody, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr)); + ASSERT_EQ(1, GetChildCount(tr)); + + GumboNode* td = GetChild(tr, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td)); + ASSERT_EQ(1, GetChildCount(td)); + + GumboNode* select = GetChild(td, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, select->type); + EXPECT_EQ(GUMBO_TAG_SELECT, GetTag(select)); + ASSERT_EQ(1, GetChildCount(select)); + + GumboNode* option = GetChild(select, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, option->type); + EXPECT_EQ(GUMBO_TAG_OPTION, GetTag(option)); + ASSERT_EQ(0, GetChildCount(option)); +} + +TEST_F(GumboParserTest, ImplicitColgroup) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* table = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table)); + ASSERT_EQ(1, GetChildCount(table)); + + GumboNode* colgroup = GetChild(table, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, colgroup->type); + EXPECT_EQ(GUMBO_TAG_COLGROUP, GetTag(colgroup)); + ASSERT_EQ(2, GetChildCount(colgroup)); + + GumboNode* col1 = GetChild(colgroup, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, col1->type); + EXPECT_EQ(GUMBO_TAG_COL, GetTag(col1)); + ASSERT_EQ(0, GetChildCount(col1)); + + GumboNode* col2 = GetChild(colgroup, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, col2->type); + EXPECT_EQ(GUMBO_TAG_COL, GetTag(col2)); + ASSERT_EQ(0, GetChildCount(col2)); +} + +TEST_F(GumboParserTest, Form) { + Parse("
After form"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* form = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, form->type); + EXPECT_EQ(GUMBO_TAG_FORM, GetTag(form)); + ASSERT_EQ(1, GetChildCount(form)); + + GumboNode* input = GetChild(form, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, input->type); + EXPECT_EQ(GUMBO_TAG_INPUT, GetTag(input)); + ASSERT_EQ(0, GetChildCount(input)); + + GumboNode* text = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("After form", text->v.text.text); +} + +// See: https://github.com/google/gumbo-parser/issues/350 +TEST_F(GumboParserTest, FormEndPos) { + Parse("
"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* form = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, form->type); + EXPECT_EQ(GUMBO_TAG_FORM, GetTag(form)); + ASSERT_EQ(1, GetChildCount(form)); + + ASSERT_EQ(form->v.element.start_pos.offset, 1); + ASSERT_EQ(form->v.element.end_pos.offset, 28); +} + +TEST_F(GumboParserTest, NestedForm) { + Parse("
After form"); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(2, GetChildCount(body)); + + GumboNode* form = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, form->type); + EXPECT_EQ(GUMBO_TAG_FORM, GetTag(form)); + ASSERT_EQ(2, GetChildCount(form)); + + GumboNode* label = GetChild(form, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, label->type); + EXPECT_EQ(GUMBO_TAG_LABEL, GetTag(label)); + ASSERT_EQ(1, GetChildCount(label)); + + GumboNode* input = GetChild(form, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, input->type); + EXPECT_EQ(GUMBO_TAG_INPUT, GetTag(input)); + ASSERT_EQ(0, GetChildCount(input)); + + GumboNode* text = GetChild(body, 1); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("After form", text->v.text.text); +} + +TEST_F(GumboParserTest, MisnestedFormInTable) { + // Parse of this is somewhat weird. The first
is opened outside the + // table, so when
checks to see if there's a form in scope, it stops + // at the boundary and returns null. The form pointer is nulled out + // anyway, though, which means that the second form (parsed in the table body + // state) ends up creating an element. It's immediately popped off + // the stack, but the form element pointer remains set to that node (which is + // not on the stack of open elements). The final tag triggers the + // "does not have node in scope" clause and is ignored. (Note that this is + // different from "has a form element in scope" - the first form is still in + // scope at that point, but the form pointer does not point to it.) Then the + // original element is closed implicitly when the table cell is closed. + Parse( + "
type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table1)); + ASSERT_EQ(1, GetChildCount(table1)); + + GumboNode* tbody1 = GetChild(table1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody1->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody1)); + ASSERT_EQ(1, GetChildCount(tbody1)); + + GumboNode* tr1 = GetChild(tbody1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr1->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr1)); + ASSERT_EQ(1, GetChildCount(tr1)); + + GumboNode* td1 = GetChild(tr1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td1->type); + EXPECT_EQ(GUMBO_TAG_TD, GetTag(td1)); + ASSERT_EQ(1, GetChildCount(td1)); + + GumboNode* form1 = GetChild(td1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, form1->type); + EXPECT_EQ(GUMBO_TAG_FORM, GetTag(form1)); + ASSERT_EQ(1, GetChildCount(form1)); + + GumboNode* table2 = GetChild(form1, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table2->type); + EXPECT_EQ(GUMBO_TAG_TABLE, GetTag(table2)); + ASSERT_EQ(1, GetChildCount(table2)); + + GumboNode* tbody2 = GetChild(table2, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody2->type); + EXPECT_EQ(GUMBO_TAG_TBODY, GetTag(tbody2)); + ASSERT_EQ(2, GetChildCount(tbody2)); + + GumboNode* tr2 = GetChild(tbody2, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr2->type); + EXPECT_EQ(GUMBO_TAG_TR, GetTag(tr2)); + ASSERT_EQ(1, GetChildCount(tr2)); + + GumboNode* form2 = GetChild(tbody2, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, form2->type); + EXPECT_EQ(GUMBO_TAG_FORM, GetTag(form2)); + ASSERT_EQ(0, GetChildCount(form2)); +} + +TEST_F(GumboParserTest, NestedRawtextTags) { + Parse( + "
" + "" + "
" + "
0"); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* math = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, math->type); + EXPECT_EQ(GUMBO_TAG_MATH, math->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_MATHML, math->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(math)); + + GumboNode* th = GetChild(math, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, th->type); + EXPECT_EQ(GUMBO_TAG_TH, th->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_MATHML, th->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(th)); + + GumboNode* mi = GetChild(th, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, mi->type); + EXPECT_EQ(GUMBO_TAG_MI, mi->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_MATHML, mi->v.element.tag_namespace); + ASSERT_EQ(2, GetChildCount(mi)); + + GumboNode* table = GetChild(mi, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, table->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_HTML, table->v.element.tag_namespace); + ASSERT_EQ(0, GetChildCount(table)); + + GumboNode* div = GetChild(mi, 1); + ASSERT_EQ(GUMBO_NODE_ELEMENT, div->type); + EXPECT_EQ(GUMBO_TAG_DIV, div->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_HTML, div->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(div)); + + GumboNode* text = GetChild(div, 0); + ASSERT_EQ(GUMBO_NODE_TEXT, text->type); + EXPECT_STREQ("0", text->v.text.text); +} + +TEST_F(GumboParserTest, TdInMathml) { + Parse(""); + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* table = GetChild(body, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type); + EXPECT_EQ(GUMBO_TAG_TABLE, table->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_HTML, table->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(table)); + + GumboNode* tbody = GetChild(table, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tbody->type); + EXPECT_EQ(GUMBO_TAG_TBODY, tbody->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_HTML, tbody->v.element.tag_namespace); + 
ASSERT_EQ(1, GetChildCount(tbody)); + + GumboNode* tr = GetChild(tbody, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, tr->type); + EXPECT_EQ(GUMBO_TAG_TR, tr->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_HTML, tr->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(tr)); + + GumboNode* th = GetChild(tr, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, th->type); + EXPECT_EQ(GUMBO_TAG_TH, th->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_HTML, th->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(th)); + + GumboNode* math = GetChild(th, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, math->type); + EXPECT_EQ(GUMBO_TAG_MATH, math->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_MATHML, math->v.element.tag_namespace); + ASSERT_EQ(1, GetChildCount(math)); + + GumboNode* td = GetChild(math, 0); + ASSERT_EQ(GUMBO_NODE_ELEMENT, td->type); + EXPECT_EQ(GUMBO_TAG_TD, td->v.element.tag); + EXPECT_EQ(GUMBO_NAMESPACE_MATHML, td->v.element.tag_namespace); + ASSERT_EQ(0, GetChildCount(td)); +} + +TEST_F(GumboParserTest, SelectInForeignContent) { + Parse("