-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgumbo.nim
649 lines (573 loc) · 36.3 KB
/
gumbo.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
## # Copyright 2010 Google Inc. All Rights Reserved.
## #
## # Licensed under the Apache License, Version 2.0 (the "License");
## # you may not use this file except in compliance with the License.
## # You may obtain a copy of the License at
## #
## # http://www.apache.org/licenses/LICENSE-2.0
## #
## # Unless required by applicable law or agreed to in writing, software
## # distributed under the License is distributed on an "AS IS" BASIS,
## # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## # See the License for the specific language governing permissions and
## # limitations under the License.
## #
## # Author: jdtang@google.com (Jonathan Tang)
## #
## # We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
## # GUMBO_ as a prefix for enum constants (static constants get the Google-style
## # kGumbo prefix).
## #*
## # @file
## # @mainpage Gumbo HTML Parser
## #
## # This provides a conformant, no-dependencies implementation of the HTML5
## # parsing algorithm. It supports only UTF8; if you need to parse a different
## # encoding, run a preprocessing step to convert to UTF8. It returns a parse
## # tree made of the structs in this file.
## #
## # Example:
## # @code
## # GumboOutput* output = gumbo_parse(input);
## # do_something_with_doctype(output->document);
## # do_something_with_html_tree(output->root);
## # gumbo_destroy_output(&options, output);
## # @endcode
## # HTML5 Spec:
## #
## # http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
## #
# File name of the Gumbo shared library for the target platform. It is the
# value handed to the `dynlib` pragma of every imported symbol in this module.
const libgumbo* =
  when defined(windows): "libgumbo.dll"
  elif defined(macosx): "libgumbo.dylib"
  else: "libgumbo.so"
# when declared(_MSC_VER):
# const
# _CRT_SECURE_NO_WARNINGS* = true
# fileno* = _fileno
type
  GumboSourcePosition* {.importc: "GumboSourcePosition",
                         header: "<gumbo.h>".} = object
    ## A character position within the original text buffer.
    ##
    ## Line and column numbers are 1-based while offsets are 0-based, which
    ## matches how most editors and command-line tools work. Columns count
    ## characters (useful for pointing at a column on a display); offsets
    ## count bytes (useful for slicing a region out of the byte buffer).
    line* {.importc: "line".}: cuint
      ## 1-based line number.
    column* {.importc: "column".}: cuint
      ## 1-based column number, measured in characters.
    offset* {.importc: "offset".}: cuint
      ## 0-based offset into the buffer, measured in bytes.
var
  kGumboEmptySourcePosition* {.importc: "kGumboEmptySourcePosition",
                               dynlib: libgumbo.}: GumboSourcePosition
    ## The source position used for elements that have no position in the
    ## input, i.e. parser-inserted elements.
type
  GumboStringPiece* {.importc: "GumboStringPiece",
                      header: "<gumbo.h>".} = object
    ## A string or part of a string, represented as a `char*` plus a length.
    ##
    ## The pointer refers into a data buffer owned by some other code (often
    ## the original input). By convention string pieces are immutable because
    ## they may share data; use GumboStringBuffer to construct a string.
    ## Clients must not assume NUL termination and should always work with
    ## the explicit length.
    data* {.importc: "data".}: cstring
      ## Pointer to the beginning of the string. NULL iff `length == 0`.
    length* {.importc: "length".}: csize
      ## Length of the string fragment in bytes. May be zero.
var
  kGumboEmptyString* {.importc: "kGumboEmptyString",
                       dynlib: libgumbo.}: GumboStringPiece
    ## A constant representing a 0-length null string.
proc gumbo_string_equals*(str1, str2: ptr GumboStringPiece): bool {.
    importc: "gumbo_string_equals", cdecl, dynlib: libgumbo.}
  ## Compares two GumboStringPieces; returns true when they are equal and
  ## false otherwise.
proc gumbo_string_equals_ignore_case*(str1, str2: ptr GumboStringPiece): bool {.
    importc: "gumbo_string_equals_ignore_case", cdecl, dynlib: libgumbo.}
  ## Compares two GumboStringPieces while ignoring case; returns true when
  ## they are equal and false otherwise.
type
  GumboVector* {.importc: "GumboVector", header: "<gumbo.h>".} = object
    ## A simple vector: a pointer to a data array plus a length.
    ##
    ## Every element is stored as a `void*`; client code must cast to the
    ## appropriate type. When an addition overflows, the data array is
    ## reallocated with its size doubled to keep O(1) amortized cost. There is
    ## no removal function because the library never needs one. Iterate by
    ## inspecting the structure directly in a for-loop.
    data* {.importc: "data".}: ptr pointer
      ## Data elements: a dynamically-allocated array of `capacity` entries,
      ## each a `void*` to the element itself.
    length* {.importc: "length".}: cuint
      ## Number of elements currently in the vector.
    capacity* {.importc: "capacity".}: cuint
      ## Current array capacity.
var
  kGumboEmptyVector* {.importc: "kGumboEmptyVector",
                       dynlib: libgumbo.}: GumboVector
    ## An empty (0-length, 0-capacity) GumboVector.
proc gumbo_vector_index_of*(vector: ptr GumboVector; element: pointer): cint {.
    importc: "gumbo_vector_index_of", cdecl, dynlib: libgumbo.}
  ## Returns the first index at which `element` appears in `vector` (compared
  ## by pointer equality), or -1 if it never does.
type
  GumboTag* {.size: sizeof(cint).} = enum
    ## All tags defined in the HTML5 standard, named after the tags
    ## themselves. Constants exist only for tags that appear in the spec (or
    ## that get special handling in the SVG and MathML namespaces); every
    ## other tag is reported as `GUMBO_TAG_UNKNOWN`, and its actual name can
    ## be obtained through `original_tag`.
    ##
    ## This is mostly an API convenience so clients need no strcasecmp to find
    ## the normalized tag name; it also lets the parser work with enums rather
    ## than strings.
    ##
    ## The list is generated from tag.in (`gentags.py src/tag.in`); do not
    ## edit it here, edit src/tag.in instead. Member order defines the
    ## numeric values and must not change.
    # ordinals 0..9
    GUMBO_TAG_HTML, GUMBO_TAG_HEAD, GUMBO_TAG_TITLE, GUMBO_TAG_BASE,
    GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_STYLE, GUMBO_TAG_SCRIPT,
    GUMBO_TAG_NOSCRIPT, GUMBO_TAG_TEMPLATE,
    # ordinals 10..19
    GUMBO_TAG_BODY, GUMBO_TAG_ARTICLE, GUMBO_TAG_SECTION, GUMBO_TAG_NAV,
    GUMBO_TAG_ASIDE, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
    GUMBO_TAG_H5,
    # ordinals 20..29
    GUMBO_TAG_H6, GUMBO_TAG_HGROUP, GUMBO_TAG_HEADER, GUMBO_TAG_FOOTER,
    GUMBO_TAG_ADDRESS, GUMBO_TAG_P, GUMBO_TAG_HR, GUMBO_TAG_PRE,
    GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_OL,
    # ordinals 30..39
    GUMBO_TAG_UL, GUMBO_TAG_LI, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_DD,
    GUMBO_TAG_FIGURE, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_MAIN, GUMBO_TAG_DIV,
    GUMBO_TAG_A,
    # ordinals 40..49
    GUMBO_TAG_EM, GUMBO_TAG_STRONG, GUMBO_TAG_SMALL, GUMBO_TAG_S,
    GUMBO_TAG_CITE, GUMBO_TAG_Q, GUMBO_TAG_DFN, GUMBO_TAG_ABBR,
    GUMBO_TAG_DATA, GUMBO_TAG_TIME,
    # ordinals 50..59
    GUMBO_TAG_CODE, GUMBO_TAG_VAR, GUMBO_TAG_SAMP, GUMBO_TAG_KBD,
    GUMBO_TAG_SUB, GUMBO_TAG_SUP, GUMBO_TAG_I, GUMBO_TAG_B, GUMBO_TAG_U,
    GUMBO_TAG_MARK,
    # ordinals 60..69
    GUMBO_TAG_RUBY, GUMBO_TAG_RT, GUMBO_TAG_RP, GUMBO_TAG_BDI, GUMBO_TAG_BDO,
    GUMBO_TAG_SPAN, GUMBO_TAG_BR, GUMBO_TAG_WBR, GUMBO_TAG_INS, GUMBO_TAG_DEL,
    # ordinals 70..79
    GUMBO_TAG_IMAGE, GUMBO_TAG_IMG, GUMBO_TAG_IFRAME, GUMBO_TAG_EMBED,
    GUMBO_TAG_OBJECT, GUMBO_TAG_PARAM, GUMBO_TAG_VIDEO, GUMBO_TAG_AUDIO,
    GUMBO_TAG_SOURCE, GUMBO_TAG_TRACK,
    # ordinals 80..89
    GUMBO_TAG_CANVAS, GUMBO_TAG_MAP, GUMBO_TAG_AREA, GUMBO_TAG_MATH,
    GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS, GUMBO_TAG_MTEXT,
    GUMBO_TAG_MGLYPH,
    # ordinals 90..99
    GUMBO_TAG_MALIGNMARK, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_SVG,
    GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TABLE,
    GUMBO_TAG_CAPTION, GUMBO_TAG_COLGROUP, GUMBO_TAG_COL, GUMBO_TAG_TBODY,
    # ordinals 100..109
    GUMBO_TAG_THEAD, GUMBO_TAG_TFOOT, GUMBO_TAG_TR, GUMBO_TAG_TD,
    GUMBO_TAG_TH, GUMBO_TAG_FORM, GUMBO_TAG_FIELDSET, GUMBO_TAG_LEGEND,
    GUMBO_TAG_LABEL, GUMBO_TAG_INPUT,
    # ordinals 110..119
    GUMBO_TAG_BUTTON, GUMBO_TAG_SELECT, GUMBO_TAG_DATALIST,
    GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION, GUMBO_TAG_TEXTAREA,
    GUMBO_TAG_KEYGEN, GUMBO_TAG_OUTPUT, GUMBO_TAG_PROGRESS, GUMBO_TAG_METER,
    # ordinals 120..129
    GUMBO_TAG_DETAILS, GUMBO_TAG_SUMMARY, GUMBO_TAG_MENU, GUMBO_TAG_MENUITEM,
    GUMBO_TAG_APPLET, GUMBO_TAG_ACRONYM, GUMBO_TAG_BGSOUND, GUMBO_TAG_DIR,
    GUMBO_TAG_FRAME, GUMBO_TAG_FRAMESET,
    # ordinals 130..139
    GUMBO_TAG_NOFRAMES, GUMBO_TAG_ISINDEX, GUMBO_TAG_LISTING, GUMBO_TAG_XMP,
    GUMBO_TAG_NEXTID, GUMBO_TAG_NOEMBED, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_RB,
    GUMBO_TAG_STRIKE, GUMBO_TAG_BASEFONT,
    # ordinals 140..149
    GUMBO_TAG_BIG, GUMBO_TAG_BLINK, GUMBO_TAG_CENTER, GUMBO_TAG_FONT,
    GUMBO_TAG_MARQUEE, GUMBO_TAG_MULTICOL, GUMBO_TAG_NOBR, GUMBO_TAG_SPACER,
    GUMBO_TAG_TT, GUMBO_TAG_RTC,
    # New tags are appended to the end of tag.in (i.e. just above this line)
    # so as to preserve backwards-compatibility.
    GUMBO_TAG_UNKNOWN, ## Used for all tags without special handling in HTML.
    GUMBO_TAG_LAST     ## Marks the end of the enum, for iterating over it;
                       ## also the terminator for varargs functions that
                       ## take tags.
proc gumbo_normalized_tagname*(tag: GumboTag): cstring {.
    importc: "gumbo_normalized_tagname", cdecl, dynlib: libgumbo.}
  ## Returns the normalized tag name for a GumboTag value (usually all
  ## lowercase, except for foreign content). The returned string is static
  ## data owned by the library.
proc gumbo_tag_from_original_text*(text: ptr GumboStringPiece) {.
    importc: "gumbo_tag_from_original_text", cdecl, dynlib: libgumbo.}
  ## Extracts the tag name from the original_text field of an element or
  ## token by stripping `<`, `/`, `>` and any attributes, adjusting the
  ## passed-in GumboStringPiece in place.
  ##
  ## The resulting tag name keeps its original case and shares a buffer with
  ## the original text, which simplifies memory management. Behavior is
  ## undefined when the piece does not represent an HTML tag (`<tagname>` or
  ## `</tagname>`). A completely empty piece (NULL data pointer) makes the
  ## call a successful no-op.
proc gumbo_normalize_svg_tagname*(tagname: ptr GumboStringPiece): cstring {.
    importc: "gumbo_normalize_svg_tagname", cdecl, dynlib: libgumbo.}
  ## Fixes the case of SVG element names that are not all lowercase; see
  ## http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
  ##
  ## This is not done at parse time because there is nowhere to store a
  ## mutated tag name: the tag is an enum (TAG_UNKNOWN for most SVG tags
  ## without special handling) and the original tag name is a pointer into
  ## the original buffer. Clients call this helper to rename SVG tags instead.
  ##
  ## Returns the case-normalized SVG tag name when a replacement is found, or
  ## NULL when no normalization is called for. The returned string is static
  ## data owned by the library.
proc gumbo_tag_enum*(tagname: cstring): GumboTag {.
    importc: "gumbo_tag_enum", cdecl, dynlib: libgumbo.}
  ## Converts a tag name string (which may be in upper or mixed case) to a
  ## tag enum. `tagname` is expected to be NUL-terminated.
proc gumbo_tagn_enum*(tagname: cstring; length: cuint): GumboTag {.
    importc: "gumbo_tagn_enum", cdecl, dynlib: libgumbo.}
  ## Converts a tag name string (which may be in upper or mixed case) to a
  ## tag enum, taking the name with an explicit `length`.
  ## NOTE(review): `length` is presumably the byte count of `tagname`, so no
  ## NUL terminator is required -- confirm against gumbo.h.
type
  GumboAttributeNamespaceEnum* {.size: sizeof(cint).} = enum
    ## Attribute namespaces. HTML gives special handling to the XLink, XML
    ## and XMLNS namespaces on attributes; everything else goes in the
    ## generic "NONE" namespace.
    GUMBO_ATTR_NAMESPACE_NONE,
    GUMBO_ATTR_NAMESPACE_XLINK,
    GUMBO_ATTR_NAMESPACE_XML,
    GUMBO_ATTR_NAMESPACE_XMLNS
type
  GumboAttribute* {.importc: "GumboAttribute", header: "<gumbo.h>".} = object
    ## A single attribute on an HTML tag: a name-value pair together with
    ## information about source locations and the original source text.
    attr_namespace* {.importc: "attr_namespace".}: GumboAttributeNamespaceEnum
      ## The namespace for the attribute. Usually GUMBO_ATTR_NAMESPACE_NONE,
      ## but some XLink/XMLNS/XML attributes take special values, per:
      ## http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
    name* {.importc: "name".}: cstring
      ## The name of the attribute, held in a freshly-allocated buffer (to
      ## deal with case-normalization) and null-terminated.
    original_name* {.importc: "original_name".}: GumboStringPiece
      ## The original text of the attribute name, as a pointer into the
      ## original source buffer.
    value* {.importc: "value".}: cstring
      ## The value of the attribute, held in a freshly-allocated buffer (to
      ## deal with unescaping) and null-terminated. Surrounding quotes are not
      ## included. An attribute without a value (for example `selected` on a
      ## checkbox) yields an empty string.
    original_value* {.importc: "original_value".}: GumboStringPiece
      ## The original text of the attribute value, pointing into the original
      ## source buffer. It includes any surrounding quotes; inspect
      ## `original_value.data[0]` and
      ## `original_value.data[original_value.length - 1]` to learn which quote
      ## characters were used. An attribute without a value yields a 0-length
      ## string.
    name_start* {.importc: "name_start".}: GumboSourcePosition
      ## The starting position of the attribute name.
    name_end* {.importc: "name_end".}: GumboSourcePosition
      ## The ending position of the attribute name. Not always derivable from
      ## the start of the value because whitespace may surround the `=` sign.
    value_start* {.importc: "value_start".}: GumboSourcePosition
      ## The starting position of the attribute value.
    value_end* {.importc: "value_end".}: GumboSourcePosition
      ## The ending position of the attribute value.
proc gumbo_get_attribute*(attrs: ptr GumboVector;
                          name: cstring): ptr GumboAttribute {.
    importc: "gumbo_get_attribute", cdecl, dynlib: libgumbo.}
  ## Looks up the attribute called `name` in a vector of GumboAttributes and
  ## returns it, or NULL if no such attribute exists. Matching is
  ## case-insensitive, as HTML is case-insensitive.
type
  GumboNodeType* {.size: sizeof(cint).} = enum
    ## The type of a node. This determines which member of the `node.v`
    ## union is in use.
    GUMBO_NODE_DOCUMENT,
      ## Document node. `v` will be a GumboDocument.
    GUMBO_NODE_ELEMENT,
      ## Element node. `v` will be a GumboElement.
    GUMBO_NODE_TEXT,
      ## Text node. `v` will be a GumboText.
    GUMBO_NODE_CDATA,
      ## CDATA node. `v` will be a GumboText.
    GUMBO_NODE_COMMENT,
      ## Comment node. `v` will be a GumboText, excluding comment delimiters.
    GUMBO_NODE_WHITESPACE,
      ## Text node whose contents are all whitespace. `v` will be a GumboText.
    GUMBO_NODE_TEMPLATE
      ## Template node. `v` will be a GumboElement. Kept separate from
      ## GUMBO_NODE_ELEMENT because many client libraries will want to ignore
      ## the contents of template nodes, as the spec suggests: recursing on
      ## GUMBO_NODE_ELEMENT does the right thing, while clients that want
      ## template contents should also check for GUMBO_NODE_TEMPLATE.
type
  GumboQuirksModeEnum* {.size: sizeof(cint).} = enum
    ## Document quirks mode; see
    ## http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
    GUMBO_DOCTYPE_NO_QUIRKS,
    GUMBO_DOCTYPE_QUIRKS,
    GUMBO_DOCTYPE_LIMITED_QUIRKS
type
  GumboNamespaceEnum* {.size: sizeof(cint).} = enum
    ## Element namespaces. Unlike in X(HT)ML, namespaces in HTML5 are not
    ## denoted by a prefix: anything inside an `<svg>` tag is in the SVG
    ## namespace, anything inside a `<math>` tag is in the MathML namespace,
    ## and anything else is in the HTML namespace. No other namespaces are
    ## supported, so a plain enum is sufficient.
    GUMBO_NAMESPACE_HTML,
    GUMBO_NAMESPACE_SVG,
    GUMBO_NAMESPACE_MATHML
type
  GumboParseFlags* {.size: sizeof(cint).} = enum
    ## Parse flags. The reasons why the parser inserted a node are stored as
    ## a bitvector in the node itself. This lets client code optimize out
    ## nodes that are implied by the HTML structure of the document, flag
    ## constructs that a style guide may not allow, or track the prevalence
    ## of incorrect or tricky HTML code.
    ##
    ## Each member is a single bit; bit 2 (0x004) belonged to a flag that has
    ## since been removed and is intentionally left unused.
    GUMBO_INSERTION_NORMAL = 0x000,
      ## A normal node: both start and end tags appear in the source and
      ## nothing has been reparented.
    GUMBO_INSERTION_BY_PARSER = 0x001,
      ## A node inserted by the parser to fulfill some implicit insertion
      ## rule. Usually set in addition to another flag giving a more specific
      ## reason; it is a generic catch-all meaning "the start tag for this
      ## node did not appear in the document source".
    GUMBO_INSERTION_IMPLICIT_END_TAG = 0x002,
      ## The end tag for this node did not appear in the document source.
      ## A parser-inserted node can still have an explicit end tag: for
      ## "Text</html>" the `<html>` node has GUMBO_INSERTION_BY_PARSER set but
      ## not this flag, because the `</html>` tag actually exists. The flag is
      ## set only when the end tag is completely missing; a merely misplaced
      ## end tag (e.g. `</body>` with text afterwards) leaves it unset, and
      ## clients must inspect the parse errors for that case.
    GUMBO_INSERTION_IMPLIED = 0x008,
      ## The node was inserted because its presence is implied by other tags,
      ## e.g. `<html>`, `<head>`, `<body>`, `<tbody>`.
    GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 0x010,
      ## The node was converted from its end tag equivalent. For example,
      ## `</p>` with no open paragraph makes the parser create a `<p>` tag and
      ## immediately close it, while `</br>` means the same thing as `<br>`.
    GUMBO_INSERTION_FROM_ISINDEX = 0x020,
      ## The node was produced by the parse of an `<isindex>` tag.
    GUMBO_INSERTION_FROM_IMAGE = 0x040,
      ## An `<image>` tag that was rewritten as `<img>`.
    GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 0x080,
      ## The node was cloned while reconstructing the active formatting
      ## elements. Set only on the clone; the initial portion of the
      ## formatting run is a NORMAL node with an IMPLICIT_END_TAG.
    GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 0x100,
      ## The node was cloned by the adoption agency algorithm.
    GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 0x200,
      ## The node was moved by the adoption agency algorithm.
    GUMBO_INSERTION_FOSTER_PARENTED = 0x400
      ## The node has been foster-parented out of a table (or should have
      ## been, if verbatim mode is set).
type
  GumboDocument* {.importc: "GumboDocument", header: "<gumbo.h>".} = object
    ## Information specific to document nodes.
    children* {.importc: "children".}: GumboVector
      ## Vector of `GumboNode*` holding the children of this node. Normally
      ## this consists of the `<html>` element and any comment nodes found.
      ## Pointers are owned.
    has_doctype* {.importc: "has_doctype".}: bool
      ## True if there was an explicit doctype token, as opposed to it being
      ## omitted.
    # The next three fields come from the doctype token, copied verbatim.
    name* {.importc: "name".}: cstring
    public_identifier* {.importc: "public_identifier".}: cstring
    system_identifier* {.importc: "system_identifier".}: cstring
    doc_type_quirks_mode* {.importc: "doc_type_quirks_mode".}: GumboQuirksModeEnum
      ## Whether or not the document is in QuirksMode, as determined by the
      ## values in the GumboTokenDocType template.
type
  GumboText* {.importc: "GumboText", header: "<gumbo.h>".} = object
    ## Represents TEXT, CDATA, COMMENT and WHITESPACE nodes: just a block of
    ## text and its position.
    text* {.importc: "text".}: cstring
      ## The text of this node after entities have been parsed and decoded.
      ## For comment/cdata nodes the comment delimiters are not included.
    original_text* {.importc: "original_text".}: GumboStringPiece
      ## The original text of this node, as a pointer into the original
      ## buffer. For comment/cdata nodes the comment delimiters are included.
    start_pos* {.importc: "start_pos".}: GumboSourcePosition
      ## The starting position of this node. Corresponds to the position of
      ## `original_text`, before entities are decoded.
type
  GumboElement* {.importc: "GumboElement", header: "<gumbo.h>".} = object
    ## Represents all HTML elements: information about the tag, its
    ## attributes and its child nodes.
    children* {.importc: "children".}: GumboVector
      ## Vector of `GumboNode*` holding the children of this element.
      ## Pointers are owned.
    tag* {.importc: "tag".}: GumboTag
      ## The GumboTag enum for this element.
    tag_namespace* {.importc: "tag_namespace".}: GumboNamespaceEnum
      ## The GumboNamespaceEnum for this element.
    original_tag* {.importc: "original_tag".}: GumboStringPiece
      ## The original tag text for this element, pointing directly into the
      ## source buffer. Zero-length when the tag was inserted algorithmically
      ## (for example, `<head>` or `<tbody>` insertion).
    original_end_tag* {.importc: "original_end_tag".}: GumboStringPiece
      ## The original end tag text for this element. Zero-length when the end
      ## tag was inserted algorithmically (for example, closing a
      ## self-closing tag).
    start_pos* {.importc: "start_pos".}: GumboSourcePosition
      ## The source position for the start of the start tag.
    end_pos* {.importc: "end_pos".}: GumboSourcePosition
      ## The source position for the start of the end tag.
    attributes* {.importc: "attributes".}: GumboVector
      ## Vector of `GumboAttribute*` holding the attributes of this tag in
      ## the order that they were parsed. Pointers are owned.
type
  INNER_C_UNION_1328777486447091410* {.importc: "no_name", header: "<gumbo.h>",
                                       union.} = object
    ## The node payload stored in `GumboNode.v`; which member is valid
    ## depends on the node's `type`.
    # The `union` pragma sits in the pragma list before `=`: the older
    # `= object {.union.}` spelling is deprecated and rejected by newer
    # compilers.
    # NOTE(review): "no_name" looks like a c2nim placeholder for an anonymous
    # C union -- confirm no code needs to declare a value of this type by name.
    document* {.importc: "document".}: GumboDocument
      ## For GUMBO_NODE_DOCUMENT.
    element* {.importc: "element".}: GumboElement
      ## For GUMBO_NODE_ELEMENT.
    text* {.importc: "text".}: GumboText
      ## For everything else.
  GumboNode* {.importc: "GumboNode", header: "<gumbo.h>".} = object
    ## A supertype for GumboElement and GumboText, so that one generic type
    ## can be stored in lists of children and cast to subtypes as necessary.
    `type`* {.importc: "type".}: GumboNodeType
      ## The type of node that this is.
    parent* {.importc: "parent".}: ptr GumboNode
      ## Pointer back to the parent node. Not owned.
    index_within_parent* {.importc: "index_within_parent".}: csize
      ## The index of this node within the parent's children vector.
    parse_flags* {.importc: "parse_flags".}: GumboParseFlags
      ## A bitvector of flags describing why this element was inserted into
      ## the parse tree, including a variety of special parse situations.
    v* {.importc: "v".}: INNER_C_UNION_1328777486447091410
      ## The actual node data.
type GumboInternalNode* = GumboNode
  ## Alias of GumboNode. In the C header this name is the forward declaration
  ## that lets GumboNode be used recursively in `GumboNode.parent`.
type
  GumboAllocatorFunction* =
    proc (userdata: pointer; size: csize): pointer {.cdecl.}
    ## The type of an allocator function. Its first argument is the
    ## 'userdata' member of the GumboParser struct. Semantics should match
    ## malloc: return a block of `size` bytes on success or NULL on failure;
    ## allocating a block of 0 bytes behaves as per malloc.
    ##
    ## TODO(jdtang): Add checks throughout the codebase for out-of-memory
    ## condition.
type
  GumboDeallocatorFunction* =
    proc (userdata: pointer; `ptr`: pointer) {.cdecl.}
    ## The type of a deallocator function. Its first argument is the
    ## 'userdata' member of the GumboParser struct.
type
  GumboOptions* {.importc: "GumboOptions", header: "<gumbo.h>".} = object
    ## Input struct holding configuration options for the parser. These let
    ## you specify alternate memory managers, provide different error
    ## handling, etc. Start from `kGumboDefaultOptions` for sensible defaults
    ## and only set what you need.
    allocator* {.importc: "allocator".}: GumboAllocatorFunction
      ## A memory allocator function. Default: malloc.
    deallocator* {.importc: "deallocator".}: GumboDeallocatorFunction
      ## A memory deallocator function. Default: free.
    userdata* {.importc: "userdata".}: pointer
      ## An opaque object passed as the first argument to all callbacks used
      ## by this library. Default: NULL.
    tab_stop* {.importc: "tab_stop".}: cint
      ## The tab-stop size, for computing positions in source code that uses
      ## tabs. Default: 8.
    stop_on_first_error* {.importc: "stop_on_first_error".}: bool
      ## Whether or not to stop parsing when the first error is encountered.
      ## Default: false.
    max_errors* {.importc: "max_errors".}: cint
      ## The maximum number of errors before the parser stops recording them,
      ## so that a totally broken page does not fill the errors vector and
      ## exhaust memory with useless redundant errors. Set to -1 to disable
      ## the limit. Default: -1.
    fragment_context* {.importc: "fragment_context".}: GumboTag
      ## The fragment context for parsing:
      ## https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
      ##
      ## GUMBO_TAG_LAST means "no fragment", i.e. the regular parsing
      ## algorithm. Otherwise pass the tag enum of the intended parent of the
      ## parsed fragment. Only the tag enum is needed (rather than a full
      ## node) because it is enough to set up all the parsing context, and it
      ## lets client code act as if parsing a fragment even when a full HTML
      ## tree is not available. Default: GUMBO_TAG_LAST.
    fragment_namespace* {.importc: "fragment_namespace".}: GumboNamespaceEnum
      ## The namespace for the fragment context. This lets client code
      ## differentiate between, say, parsing a `<title>` tag in SVG vs.
      ## parsing it in HTML. Default: GUMBO_NAMESPACE_HTML.
var
  kGumboDefaultOptions* {.importc: "kGumboDefaultOptions",
                          dynlib: libgumbo.}: GumboOptions
    ## Default options struct; use this with `gumbo_parse_with_options`.
type
  GumboOutput* {.importc: "GumboOutput", header: "<gumbo.h>".} = object
    ## The output struct containing the results of the parse.
    document* {.importc: "document".}: ptr GumboNode
      ## Pointer to the document node: a GumboNode of type NODE_DOCUMENT that
      ## contains the entire document as its child.
    root* {.importc: "root".}: ptr GumboNode
      ## Pointer to the root node: the `<html>` tag that forms the root of
      ## the document.
    errors* {.importc: "errors".}: GumboVector
      ## Vector of GumboError entries: the errors that occurred during the
      ## parse.
      ## NOTE: In version 1.0 of this library the API for errors has not been
      ## fully fleshed out and may change in the future, which is why the
      ## GumboError header is not part of the public API. Contact the library
      ## authors if you need errors reported.
proc gumbo_parse*(buffer: cstring): ptr GumboOutput {.
    importc: "gumbo_parse", cdecl, dynlib: libgumbo.}
  ## Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer
  ## must live at least as long as the parse tree, because some fields (e.g.
  ## original_text) point directly into the original buffer.
  ##
  ## Buffers longer than 4 gigabytes are not supported.
proc gumbo_parse_with_options*(options: ptr GumboOptions; buffer: cstring;
                               buffer_length: csize): ptr GumboOutput {.
    importc: "gumbo_parse_with_options", cdecl, dynlib: libgumbo.}
  ## Extended version of `gumbo_parse` that takes an explicit options
  ## structure, buffer, and length.
proc gumbo_destroy_output*(options: ptr GumboOptions;
                           output: ptr GumboOutput) {.
    importc: "gumbo_destroy_output", cdecl, dynlib: libgumbo.}
  ## Releases the memory used for the parse tree and the parse errors.