More Grammar changes as per USFM/X 3.1 (#255)

* Allow any 3-letter code as bookcode
* Support nested markers without +
* Make closing mandatory for \fv
* Introduce 'key' attribute in \k
Bridgeconn · Aug 7, 2024 · 9edddf8 · 9edddf8
1 parent c151e14
commit 9edddf8
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 97 deletions.
diff --git a/py-usfm-parser/src/usfm_grammar/usx_generator.py b/py-usfm-parser/src/usfm_grammar/usx_generator.py
@@ -30,7 +30,7 @@ class USXGenerator:
     NESTED_CHAR_STYLE_MARKERS = [item+"Nested" for item in CHAR_STYLE_MARKERS]
     DEFAULT_ATTRIB_MAP = {"w":"lemma", "rb":"gloss", "xt":"href", "fig":"alt",
                         "xt_standalone":"href", "xtNested":"href", "ref":"loc",
-                        "milestone":"who"}
+                        "milestone":"who", "k":"key"}
     TABLE_CELL_MARKERS = ["tc", "th", "tcr", "thr"]
     MISC_MARKERS = ["fig", "cat", "esb", "b", "ph", "pi"]
 

diff --git a/py-usfm-parser/tests/__init__.py b/py-usfm-parser/tests/__init__.py
@@ -130,8 +130,6 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True):
 
     f"{TEST_DIR}/biblica/PublishingVersesWithFormatting/origin.usfm": "fail", # \c without number
 
-    f"{TEST_DIR}/specExamples/extended/contentCatogories1/origin.usfm": "fail", # cat inside footnote
-
     f'{TEST_DIR}/special-cases/figure_with_quotes_in_desc/origin.usfm': "fail", # quote within quote
     f'{TEST_DIR}/specExamples/poetry/origin.usfm': "fail", # \b not followed by a \p or \q
 

diff --git a/py-usfm-parser/tests/test_json_conversion.py b/py-usfm-parser/tests/test_json_conversion.py
@@ -142,9 +142,9 @@ def remove_newlines_in_text(usj_dict):
         for i,item in enumerate(usj_dict["content"]):
             if isinstance(item, str):
                 usj_dict['content'][i] = item.replace("\n", " ")
-                usj_dict['content'][i] = re.sub(r" +", " ", usj_dict['content'][i])
-                continue
-            remove_newlines_in_text(item)
+                usj_dict['content'][i] = re.sub(r"\s+", " ", usj_dict['content'][i])
+            else:
+                remove_newlines_in_text(item)
 
 def strip_text_value(usj_dict):
     '''Trailing and preceding space handling can be different between tcdocs and our logic.
@@ -188,6 +188,7 @@ def test_compare_usj_with_testsuite_samples(file_path):
             pass
         except AssertionError:
             strip_default_attrib_value(origin_usj)
+            remove_newlines_in_text(origin_usj)
             strip_text_value(usj_dict)
             strip_text_value(origin_usj)
             dict_diff = DeepDiff(usj_dict, origin_usj, ignore_order=True)

diff --git a/tree-sitter-usfm3/grammar.js b/tree-sitter-usfm3/grammar.js
@@ -14,28 +14,29 @@ module.exports = grammar({
       )),
     _mandatoryHead: $ => prec.right(0, seq($.book, repeat($._bookHeader))),
 
-    bookcode: $ => choice("GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG",
-              "RUT", "1SA", "2SA", "1KI", "2KI", 
-              "1CH", "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO", 
-              "ECC", "SNG", "ISA", "JER", "LAM", "EZK", "DAN", 
-              "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", 
-              "ZEP", "HAG", "ZEC", "MAL", "MAT", "MRK", "LUK", 
-              "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", 
-              "COL", "1TH", "2TH", "1TI", "2TI", "TIT", "PHM", 
-              "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD", 
-              "REV", "TOB", "JDT", "ESG", "WIS", "SIR", "BAR", 
-              "LJE", "S3Y", "SUS", "BEL", "1MA", "2MA", "3MA", "4MA", 
-              "1ES", "2ES", "MAN", "PS2", "ODA", "PSS", "EZA", 
-              "5EZ", "6EZ", "DAG", "PS3", "2BA", "LBA", "JUB", "ENO", 
-              "1MQ", "2MQ", "3MQ", "REP", "4BA", "LAO", "FRT", 
-              "BAK", "OTH", "INT", "CNC", "GLO", "TDX", "NDX", "TOB",
-              "JDT", "ESG", "WIS", "SIR", "BAR", "LJE", "S3Y", 
-              "SUS", "BEL", "1MA", "2MA", "3MA", "4MA", "1ES", "2ES", 
-              "MAN", "PS2", "ODA", "PSS", "EZA", "5EZ", "6EZ", 
-              "DAG", "PS3", "2BA", "LBA", "JUB", "ENO", "1MQ", "2MQ", 
-              "3MQ", "REP", "4BA", "LAO", "FRT", "BAK", "OTH", 
-              "INT", "CNC", "GLO", "TDX", "NDX", "XXA", "XXB", "XXC", 
-              "XXD", "XXE", "XXF", "XXG"),
+    bookcode: $ => /\w{3}/,
+     // bookcode: $ => choice("GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG",
+     //          "RUT", "1SA", "2SA", "1KI", "2KI", 
+     //          "1CH", "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO", 
+     //          "ECC", "SNG", "ISA", "JER", "LAM", "EZK", "DAN", 
+     //          "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", 
+     //          "ZEP", "HAG", "ZEC", "MAL", "MAT", "MRK", "LUK", 
+     //          "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", 
+     //          "COL", "1TH", "2TH", "1TI", "2TI", "TIT", "PHM", 
+     //          "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD", 
+     //          "REV", "TOB", "JDT", "ESG", "WIS", "SIR", "BAR", 
+     //          "LJE", "S3Y", "SUS", "BEL", "1MA", "2MA", "3MA", "4MA", 
+     //          "1ES", "2ES", "MAN", "PS2", "ODA", "PSS", "EZA", 
+     //          "5EZ", "6EZ", "DAG", "PS3", "2BA", "LBA", "JUB", "ENO", 
+     //          "1MQ", "2MQ", "3MQ", "REP", "4BA", "LAO", "FRT", 
+     //          "BAK", "OTH", "INT", "CNC", "GLO", "TDX", "NDX", "TOB",
+     //          "JDT", "ESG", "WIS", "SIR", "BAR", "LJE", "S3Y", 
+     //          "SUS", "BEL", "1MA", "2MA", "3MA", "4MA", "1ES", "2ES", 
+     //          "MAN", "PS2", "ODA", "PSS", "EZA", "5EZ", "6EZ", 
+     //          "DAG", "PS3", "2BA", "LBA", "JUB", "ENO", "1MQ", "2MQ", 
+     //          "3MQ", "REP", "4BA", "LAO", "FRT", "BAK", "OTH", 
+     //          "INT", "CNC", "GLO", "TDX", "NDX", "XXA", "XXB", "XXC", 
+     //          "XXD", "XXE", "XXF", "XXG"),
     text: $ => /[^\\\|]+/,
     _text: $ => /[^\\\|]+/,
     _spaceOrLine: $ => /[\s\n\r]/,
@@ -87,6 +88,7 @@ module.exports = grammar({
     _introText: $ => repeat1(choice($.text, $.iqt,
       $.xt_standalone,
       $._characterMarker,
+      $.fig,
       $.ref,
       )),
 
@@ -128,7 +130,7 @@ module.exports = grammar({
 
     // verse
     verseText: $ => prec.right(0, repeat1(choice($.text,
-      $._characterMarker,
+      $._characterMarker, $.fig
       ))),
     v: $ => prec.right(0,seq("\\v ", $.verseNumber, repeat($._verseMeta))),
     verseNumber: $ => /\d+\w?(-\d+\w?)?[\s\n\r]*/,
@@ -178,6 +180,7 @@ module.exports = grammar({
     cp: $ => seq("\\cp ", $.text),
     cd: $ => prec.right(0,seq("\\cd ", repeat1(choice($.text,
       $._characterMarker,
+      $.fig,
       $.xt_standalone
       )))),
 
@@ -195,28 +198,30 @@ module.exports = grammar({
 
     mtBlock: $ => prec.right(0,repeat1($.mt)),
     mt: $ => seq($.mtTag, repeat1(choice($.text,
-      $.footnote, $.crossref      
+      $.footnote, $.crossref
       ))),
     mtTag: $ => seq("\\mt",optional($.numberedLevelMax4), " "),
 
     mteBlock: $ => prec.right(0,repeat1($.mte)),
     mte: $ => prec.right(0, seq($.mteTag, repeat1(choice($.text,
-      $.footnote, $.crossref      
+      $.footnote, $.crossref
       )))),
     mteTag: $ => seq("\\mte",optional(token.immediate(/[12]/)), " "),
 
     msBlock: $ => prec.right(0, repeat1($.ms)),
     ms: $ => prec.right(0, seq($.msTag, repeat1(choice($.text,
       $.footnote, $.crossref,
-      $._characterMarker      
+      $._characterMarker,
+      $.fig,
       )), optional($.mr))),
     msTag: $ => seq("\\ms",optional($.numberedLevelMax3), " "),
     mr: $ => seq("\\mr ", $.text),
 
     sBlock: $ => prec.right(0, repeat1($.s)),
     s: $ => prec.right(0, seq($.sTag, repeat(choice($.text,
       $.footnote, $.crossref, 
-      $._characterMarker      
+      $._characterMarker,
+      $.fig,
       )), optional($.sr), optional($.r))),
     sTag: $ => seq("\\s",optional($.numberedLevelMax5), " "),
     sr: $ => seq("\\sr ", $.text),
@@ -371,6 +376,7 @@ module.exports = grammar({
     caller: $ => /[^\s\\]+/,
     footnoteText: $ => prec.right(0, repeat1(choice($.text,
       $._nestedCharacterMarker,
+      $._characterMarker,
       $.ref,
       $.fig,
       $.xt_standalone,
@@ -380,6 +386,7 @@ module.exports = grammar({
 
     crossrefText: $ => prec.right(0, repeat1(choice($.text,
       $._nestedCharacterMarker,
+      // $._characterMarker,
       $.ref,
       $.fig,
       ))),
@@ -398,7 +405,7 @@ module.exports = grammar({
     fp: $ => seq("\\fp ", $.footnoteText, optional("\\fp*")),
     ft: $ => seq("\\ft ", $.footnoteText, optional("\\ft*")),
     fdc: $ => seq("\\fdc ", $.footnoteText, optional("\\fdc*")),
-    fv: $ => seq("\\fv ", $.text, optional("\\fv*")),
+    fv: $ => seq("\\fv ", $.text, "\\fv*"),
     fm: $ => seq("\\fm ", $.footnoteText, "\\fm*"),
 
     _footnoteContents: $ => choice(
@@ -432,7 +439,7 @@ module.exports = grammar({
       optional("\\+xt*")),
     xt_standalone: $ => seq("\\xt ", $.crossrefText,
       optional(choice($.defaultAttribute, $._attributesInCrossref)), 
-      choice("\\xt*", "\\x*")),
+      "\\xt*"),
     xta: $ => seq("\\xta ", $.crossrefText, optional("\\xta*")),
     xop: $ => seq("\\xop ", $.crossrefText, optional("\\xop*")),
     xot: $ => seq("\\xot ", $.crossrefText, optional("\\xot*")),
@@ -459,39 +466,40 @@ module.exports = grammar({
       $._nestedCharacterMarker,
       // $._characterMarker,
       $.footnote,
-      $.crossref
+      $.crossref,
+      $.fig,
     ))),
 
-    add: $ => seq("\\add", $._innerText, "\\add*"),
-    bk: $ => seq("\\bk", $._innerText, "\\bk*"),
-    dc: $ => seq("\\dc", $._innerText, "\\dc*"),
-    k: $ => seq("\\k", $._innerText, "\\k*"),
-    nd: $ => seq("\\nd", $._innerText, "\\nd*"),
-    ord: $ => seq("\\ord", $._innerText, "\\ord*"),
-    pn: $ => seq("\\pn", $._innerText, "\\pn*"),
-    png: $ => seq("\\png", $._innerText, "\\png*"),
-    addpn: $ => seq("\\addpn", $._innerText, "\\addpn*"),
-    qt: $ => seq("\\qt", $._innerText, "\\qt*"),
-    sig: $ => seq("\\sig", $._innerText, "\\sig*"),
-    sls: $ => seq("\\sls", $._innerText, "\\sls*"),
-    tl: $ => seq("\\tl", $._innerText, "\\tl*"),
-    wj: $ => seq("\\wj", $._innerText, "\\wj*"),
-
-    em: $ => seq("\\em", $._innerText, "\\em*"),
-    bd: $ => seq("\\bd", $._innerText, "\\bd*"),
-    it: $ => seq("\\it", $._innerText, "\\it*"),
-    bdit: $ => seq("\\bdit", $._innerText, "\\bdit*"),
-    no: $ => seq("\\no", $._innerText, "\\no*"),
-    sc: $ => seq("\\sc", $._innerText, "\\sc*"),
-    sup: $ => seq("\\sup", $._innerText, "\\sup*"),
-
-    ndx: $ => seq("\\ndx", $._innerText, "\\ndx*"),
-    pro: $ => seq("\\pro", $._innerText, "\\pro*"),
-    rb: $ => seq("\\rb", $._innerText, choice($.defaultAttribute, $._rbAttributes), "\\rb*"),
-    w: $ => seq("\\w", $._innerText, optional(choice($.defaultAttribute, $._wAttributes)), "\\w*"),
-    wg: $ => seq("\\wg", $._innerText, "\\wg*"),
-    wh: $ => seq("\\wh", $._innerText, "\\wh*"),
-    wa: $ => seq("\\wa", $._innerText, "\\wa*"),
+    add: $ => seq("\\add", $._spaceOrLine, $._innerText, "\\add*"),
+    bk: $ => seq("\\bk", $._spaceOrLine, $._innerText, "\\bk*"),
+    dc: $ => seq("\\dc", $._spaceOrLine, $._innerText, "\\dc*"),
+    k: $ => seq("\\k", $._spaceOrLine, $._innerText, optional(choice($.defaultAttribute, $._kAttributes)), "\\k*"),
+    nd: $ => seq("\\nd", $._spaceOrLine, $._innerText, "\\nd*"),
+    ord: $ => seq("\\ord", $._spaceOrLine, $._innerText, "\\ord*"),
+    pn: $ => seq("\\pn", $._spaceOrLine, $._innerText, "\\pn*"),
+    png: $ => seq("\\png", $._spaceOrLine, $._innerText, "\\png*"),
+    addpn: $ => seq("\\addpn", $._spaceOrLine, $._innerText, "\\addpn*"),
+    qt: $ => seq("\\qt", $._spaceOrLine, $._innerText, "\\qt*"),
+    sig: $ => seq("\\sig", $._spaceOrLine, $._innerText, "\\sig*"),
+    sls: $ => seq("\\sls", $._spaceOrLine, $._innerText, "\\sls*"),
+    tl: $ => seq("\\tl", $._spaceOrLine, $._innerText, "\\tl*"),
+    wj: $ => seq("\\wj", $._spaceOrLine, $._innerText, "\\wj*"),
+
+    em: $ => seq("\\em", $._spaceOrLine, $._innerText, "\\em*"),
+    bd: $ => seq("\\bd", $._spaceOrLine, $._innerText, "\\bd*"),
+    it: $ => seq("\\it", $._spaceOrLine, $._innerText, "\\it*"),
+    bdit: $ => seq("\\bdit", $._spaceOrLine, $._innerText, "\\bdit*"),
+    no: $ => seq("\\no", $._spaceOrLine, $._innerText, "\\no*"),
+    sc: $ => seq("\\sc", $._spaceOrLine, $._innerText, "\\sc*"),
+    sup: $ => seq("\\sup", $._spaceOrLine, $._innerText, "\\sup*"),
+
+    ndx: $ => seq("\\ndx", $._spaceOrLine, $._innerText, "\\ndx*"),
+    pro: $ => seq("\\pro", $._spaceOrLine, $._innerText, "\\pro*"),
+    rb: $ => seq("\\rb", $._spaceOrLine, $._innerText, choice($.defaultAttribute, $._rbAttributes), "\\rb*"),
+    w: $ => seq("\\w", $._spaceOrLine, $._innerText, optional(choice($.defaultAttribute, $._wAttributes)), "\\w*"),
+    wg: $ => seq("\\wg", $._spaceOrLine, $._innerText, "\\wg*"),
+    wh: $ => seq("\\wh", $._spaceOrLine, $._innerText, "\\wh*"),
+    wa: $ => seq("\\wa", $._spaceOrLine, $._innerText, "\\wa*"),
 
     _characterMarker: $ => choice(
       $.add,
@@ -523,42 +531,42 @@ module.exports = grammar({
       $.wh,
       $.wa,
       $.jmp,
-      $.fig,
+//      $.fig,
       // $.zNameSpace, makes all zNameSpaces part of paragraph content, like milestones
     ),
 
-    addNested: $ => seq("\\+add", $._innerText, "\\+add*"),
-    bkNested: $ => seq("\\+bk", $._innerText, "\\+bk*"),
-    dcNested: $ => seq("\\+dc", $._innerText, "\\+dc*"),
-    kNested: $ => seq("\\+k", $._innerText, "\\+k*"),
-    ndNested: $ => seq("\\+nd", $._innerText, "\\+nd*"),
-    ordNested: $ => seq("\\+ord", $._innerText, "\\+ord*"),
-    pnNested: $ => seq("\\+pn", $._innerText, "\\+pn*"),
-    pngNested: $ => seq("\\+png", $._innerText, "\\+png*"),
-    addpnNested: $ => seq("\\+addpn", $._innerText, "\\+addpn*"),
-    qtNested: $ => seq("\\+qt", $._innerText, "\\+qt*"),
-    sigNested: $ => seq("\\+sig", $._innerText, "\\+sig*"),
-    slsNested: $ => seq("\\+sls", $._innerText, "\\+sls*"),
-    tlNested: $ => seq("\\+tl", $._innerText, "\\+tl*"),
-    wjNested: $ => seq("\\+wj", $._innerText, "\\+wj*"),
-
-    emNested: $ => seq("\\+em", $._innerText, "\\+em*"),
-    bdNested: $ => seq("\\+bd", $._innerText, "\\+bd*"),
-    itNested: $ => seq("\\+it", $._innerText, "\\+it*"),
-    bditNested: $ => seq("\\+bdit", $._innerText, "\\+bdit*"),
-    noNested: $ => seq("\\+no", $._innerText, "\\+no*"),
-    scNested: $ => seq("\\+sc", $._innerText, "\\+sc*"),
-    supNested: $ => seq("\\+sup", $._innerText, "\\+sup*"),
-
-    ndxNested: $ => seq("\\+ndx", $._innerText, "\\+ndx*"),
-    proNested: $ => seq("\\+pro", $._innerText, "\\+pro*"),
-    rbNested: $ => seq("\\+rb", $._innerText, optional(
+    addNested: $ => seq("\\+add", $._spaceOrLine, $._innerText, "\\+add*"),
+    bkNested: $ => seq("\\+bk", $._spaceOrLine, $._innerText, "\\+bk*"),
+    dcNested: $ => seq("\\+dc", $._spaceOrLine, $._innerText, "\\+dc*"),
+    kNested: $ => seq("\\+k", $._spaceOrLine, $._innerText, "\\+k*"),
+    ndNested: $ => seq("\\+nd", $._spaceOrLine, $._innerText, "\\+nd*"),
+    ordNested: $ => seq("\\+ord", $._spaceOrLine, $._innerText, "\\+ord*"),
+    pnNested: $ => seq("\\+pn", $._spaceOrLine, $._innerText, "\\+pn*"),
+    pngNested: $ => seq("\\+png", $._spaceOrLine, $._innerText, "\\+png*"),
+    addpnNested: $ => seq("\\+addpn", $._spaceOrLine, $._innerText, "\\+addpn*"),
+    qtNested: $ => seq("\\+qt", $._spaceOrLine, $._innerText, "\\+qt*"),
+    sigNested: $ => seq("\\+sig", $._spaceOrLine, $._innerText, "\\+sig*"),
+    slsNested: $ => seq("\\+sls", $._spaceOrLine, $._innerText, "\\+sls*"),
+    tlNested: $ => seq("\\+tl", $._spaceOrLine, $._innerText, "\\+tl*"),
+    wjNested: $ => seq("\\+wj", $._spaceOrLine, $._innerText, "\\+wj*"),
+
+    emNested: $ => seq("\\+em", $._spaceOrLine, $._innerText, "\\+em*"),
+    bdNested: $ => seq("\\+bd", $._spaceOrLine, $._innerText, "\\+bd*"),
+    itNested: $ => seq("\\+it", $._spaceOrLine, $._innerText, "\\+it*"),
+    bditNested: $ => seq("\\+bdit", $._spaceOrLine, $._innerText, "\\+bdit*"),
+    noNested: $ => seq("\\+no", $._spaceOrLine, $._innerText, "\\+no*"),
+    scNested: $ => seq("\\+sc", $._spaceOrLine, $._innerText, "\\+sc*"),
+    supNested: $ => seq("\\+sup", $._spaceOrLine, $._innerText, "\\+sup*"),
+
+    ndxNested: $ => seq("\\+ndx", $._spaceOrLine, $._innerText, "\\+ndx*"),
+    proNested: $ => seq("\\+pro", $._spaceOrLine, $._innerText, "\\+pro*"),
+    rbNested: $ => seq("\\+rb", $._spaceOrLine, $._innerText, optional(
       choice($.defaultAttribute, $._rbAttributes)), "\\+rb*"),
-    wNested: $ => seq("\\+w", $._innerText, optional(
+    wNested: $ => seq("\\+w", $._spaceOrLine, $._innerText, optional(
       choice($.defaultAttribute, $._wAttributes)), "\\+w*"),
-    wgNested: $ => seq("\\+wg", $._innerText, "\\+wg*"),
-    whNested: $ => seq("\\+wh", $._innerText, "\\+wh*"),
-    waNested: $ => seq("\\+wa", $._innerText, "\\+wa*"),
+    wgNested: $ => seq("\\+wg", $._spaceOrLine, $._innerText, "\\+wg*"),
+    whNested: $ => seq("\\+wh", $._spaceOrLine, $._innerText, "\\+wh*"),
+    waNested: $ => seq("\\+wa", $._spaceOrLine, $._innerText, "\\+wa*"),
 
     _nestedCharacterMarker: $ => choice(
       $.addNested,
@@ -669,6 +677,8 @@ module.exports = grammar({
       attributed elements are defined here*/
 
     // _wAttributes: $ => seq("|", $.lemmaAttribute),
+    _kAttributes: $ => $.keyAttribute,
+    keyAttribute: $ => seq("key", "=", '"', optional($.attributeValue), '"'),
     _wAttributes: $ => prec.right(0, seq("|", repeat1(choice($.lemmaAttribute, $.strongAttribute,
       $.scrlocAttribute, $.linkAttribute, $.customAttribute)))),
     _rbAttributes: $ => prec.right(0, seq("|", repeat1(choice($.glossAttribute, $.customAttribute,