Skip to content

Commit

Permalink
More Grammar changes as per USFM/X 3.1 (#255)
Browse files Browse the repository at this point in the history
* Allow any 3-letter code as bookcode

* Support nested markers without the + prefix

* Make closing mandatory for \fv

* Introduce 'key' attribute in \k
  • Loading branch information
kavitharaju authored Aug 7, 2024
1 parent c151e14 commit 9edddf8
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 97 deletions.
2 changes: 1 addition & 1 deletion py-usfm-parser/src/usfm_grammar/usx_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class USXGenerator:
NESTED_CHAR_STYLE_MARKERS = [item+"Nested" for item in CHAR_STYLE_MARKERS]
DEFAULT_ATTRIB_MAP = {"w":"lemma", "rb":"gloss", "xt":"href", "fig":"alt",
"xt_standalone":"href", "xtNested":"href", "ref":"loc",
"milestone":"who"}
"milestone":"who", "k":"key"}
TABLE_CELL_MARKERS = ["tc", "th", "tcr", "thr"]
MISC_MARKERS = ["fig", "cat", "esb", "b", "ph", "pi"]

Expand Down
2 changes: 0 additions & 2 deletions py-usfm-parser/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,6 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True):

f"{TEST_DIR}/biblica/PublishingVersesWithFormatting/origin.usfm": "fail", # \c without number

f"{TEST_DIR}/specExamples/extended/contentCatogories1/origin.usfm": "fail", # cat inside footnote

f'{TEST_DIR}/special-cases/figure_with_quotes_in_desc/origin.usfm': "fail", # quote within quote
f'{TEST_DIR}/specExamples/poetry/origin.usfm': "fail", # \b not followed by a \p or \q

Expand Down
7 changes: 4 additions & 3 deletions py-usfm-parser/tests/test_json_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,9 @@ def remove_newlines_in_text(usj_dict):
for i,item in enumerate(usj_dict["content"]):
if isinstance(item, str):
usj_dict['content'][i] = item.replace("\n", " ")
usj_dict['content'][i] = re.sub(r" +", " ", usj_dict['content'][i])
continue
remove_newlines_in_text(item)
usj_dict['content'][i] = re.sub(r"\s+", " ", usj_dict['content'][i])
else:
remove_newlines_in_text(item)

def strip_text_value(usj_dict):
'''Trailing and preceding space handling can be different between tcdocs and our logic.
Expand Down Expand Up @@ -188,6 +188,7 @@ def test_compare_usj_with_testsuite_samples(file_path):
pass
except AssertionError:
strip_default_attrib_value(origin_usj)
remove_newlines_in_text(origin_usj)
strip_text_value(usj_dict)
strip_text_value(origin_usj)
dict_diff = DeepDiff(usj_dict, origin_usj, ignore_order=True)
Expand Down
192 changes: 101 additions & 91 deletions tree-sitter-usfm3/grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,29 @@ module.exports = grammar({
)),
_mandatoryHead: $ => prec.right(0, seq($.book, repeat($._bookHeader))),

bookcode: $ => choice("GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG",
"RUT", "1SA", "2SA", "1KI", "2KI",
"1CH", "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO",
"ECC", "SNG", "ISA", "JER", "LAM", "EZK", "DAN",
"HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB",
"ZEP", "HAG", "ZEC", "MAL", "MAT", "MRK", "LUK",
"JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP",
"COL", "1TH", "2TH", "1TI", "2TI", "TIT", "PHM",
"HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
"REV", "TOB", "JDT", "ESG", "WIS", "SIR", "BAR",
"LJE", "S3Y", "SUS", "BEL", "1MA", "2MA", "3MA", "4MA",
"1ES", "2ES", "MAN", "PS2", "ODA", "PSS", "EZA",
"5EZ", "6EZ", "DAG", "PS3", "2BA", "LBA", "JUB", "ENO",
"1MQ", "2MQ", "3MQ", "REP", "4BA", "LAO", "FRT",
"BAK", "OTH", "INT", "CNC", "GLO", "TDX", "NDX", "TOB",
"JDT", "ESG", "WIS", "SIR", "BAR", "LJE", "S3Y",
"SUS", "BEL", "1MA", "2MA", "3MA", "4MA", "1ES", "2ES",
"MAN", "PS2", "ODA", "PSS", "EZA", "5EZ", "6EZ",
"DAG", "PS3", "2BA", "LBA", "JUB", "ENO", "1MQ", "2MQ",
"3MQ", "REP", "4BA", "LAO", "FRT", "BAK", "OTH",
"INT", "CNC", "GLO", "TDX", "NDX", "XXA", "XXB", "XXC",
"XXD", "XXE", "XXF", "XXG"),
bookcode: $ => /\w{3}/,
// bookcode: $ => choice("GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG",
// "RUT", "1SA", "2SA", "1KI", "2KI",
// "1CH", "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO",
// "ECC", "SNG", "ISA", "JER", "LAM", "EZK", "DAN",
// "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB",
// "ZEP", "HAG", "ZEC", "MAL", "MAT", "MRK", "LUK",
// "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP",
// "COL", "1TH", "2TH", "1TI", "2TI", "TIT", "PHM",
// "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
// "REV", "TOB", "JDT", "ESG", "WIS", "SIR", "BAR",
// "LJE", "S3Y", "SUS", "BEL", "1MA", "2MA", "3MA", "4MA",
// "1ES", "2ES", "MAN", "PS2", "ODA", "PSS", "EZA",
// "5EZ", "6EZ", "DAG", "PS3", "2BA", "LBA", "JUB", "ENO",
// "1MQ", "2MQ", "3MQ", "REP", "4BA", "LAO", "FRT",
// "BAK", "OTH", "INT", "CNC", "GLO", "TDX", "NDX", "TOB",
// "JDT", "ESG", "WIS", "SIR", "BAR", "LJE", "S3Y",
// "SUS", "BEL", "1MA", "2MA", "3MA", "4MA", "1ES", "2ES",
// "MAN", "PS2", "ODA", "PSS", "EZA", "5EZ", "6EZ",
// "DAG", "PS3", "2BA", "LBA", "JUB", "ENO", "1MQ", "2MQ",
// "3MQ", "REP", "4BA", "LAO", "FRT", "BAK", "OTH",
// "INT", "CNC", "GLO", "TDX", "NDX", "XXA", "XXB", "XXC",
// "XXD", "XXE", "XXF", "XXG"),
text: $ => /[^\\\|]+/,
_text: $ => /[^\\\|]+/,
_spaceOrLine: $ => /[\s\n\r]/,
Expand Down Expand Up @@ -87,6 +88,7 @@ module.exports = grammar({
_introText: $ => repeat1(choice($.text, $.iqt,
$.xt_standalone,
$._characterMarker,
$.fig,
$.ref,
)),

Expand Down Expand Up @@ -128,7 +130,7 @@ module.exports = grammar({

// verse
verseText: $ => prec.right(0, repeat1(choice($.text,
$._characterMarker,
$._characterMarker, $.fig
))),
v: $ => prec.right(0,seq("\\v ", $.verseNumber, repeat($._verseMeta))),
verseNumber: $ => /\d+\w?(-\d+\w?)?[\s\n\r]*/,
Expand Down Expand Up @@ -178,6 +180,7 @@ module.exports = grammar({
cp: $ => seq("\\cp ", $.text),
cd: $ => prec.right(0,seq("\\cd ", repeat1(choice($.text,
$._characterMarker,
$.fig,
$.xt_standalone
)))),

Expand All @@ -195,28 +198,30 @@ module.exports = grammar({

mtBlock: $ => prec.right(0,repeat1($.mt)),
mt: $ => seq($.mtTag, repeat1(choice($.text,
$.footnote, $.crossref
$.footnote, $.crossref
))),
mtTag: $ => seq("\\mt",optional($.numberedLevelMax4), " "),

mteBlock: $ => prec.right(0,repeat1($.mte)),
mte: $ => prec.right(0, seq($.mteTag, repeat1(choice($.text,
$.footnote, $.crossref
$.footnote, $.crossref
)))),
mteTag: $ => seq("\\mte",optional(token.immediate(/[12]/)), " "),

msBlock: $ => prec.right(0, repeat1($.ms)),
ms: $ => prec.right(0, seq($.msTag, repeat1(choice($.text,
$.footnote, $.crossref,
$._characterMarker
$._characterMarker,
$.fig,
)), optional($.mr))),
msTag: $ => seq("\\ms",optional($.numberedLevelMax3), " "),
mr: $ => seq("\\mr ", $.text),

sBlock: $ => prec.right(0, repeat1($.s)),
s: $ => prec.right(0, seq($.sTag, repeat(choice($.text,
$.footnote, $.crossref,
$._characterMarker
$._characterMarker,
$.fig,
)), optional($.sr), optional($.r))),
sTag: $ => seq("\\s",optional($.numberedLevelMax5), " "),
sr: $ => seq("\\sr ", $.text),
Expand Down Expand Up @@ -371,6 +376,7 @@ module.exports = grammar({
caller: $ => /[^\s\\]+/,
footnoteText: $ => prec.right(0, repeat1(choice($.text,
$._nestedCharacterMarker,
$._characterMarker,
$.ref,
$.fig,
$.xt_standalone,
Expand All @@ -380,6 +386,7 @@ module.exports = grammar({

crossrefText: $ => prec.right(0, repeat1(choice($.text,
$._nestedCharacterMarker,
// $._characterMarker,
$.ref,
$.fig,
))),
Expand All @@ -398,7 +405,7 @@ module.exports = grammar({
fp: $ => seq("\\fp ", $.footnoteText, optional("\\fp*")),
ft: $ => seq("\\ft ", $.footnoteText, optional("\\ft*")),
fdc: $ => seq("\\fdc ", $.footnoteText, optional("\\fdc*")),
fv: $ => seq("\\fv ", $.text, optional("\\fv*")),
fv: $ => seq("\\fv ", $.text, "\\fv*"),
fm: $ => seq("\\fm ", $.footnoteText, "\\fm*"),

_footnoteContents: $ => choice(
Expand Down Expand Up @@ -432,7 +439,7 @@ module.exports = grammar({
optional("\\+xt*")),
xt_standalone: $ => seq("\\xt ", $.crossrefText,
optional(choice($.defaultAttribute, $._attributesInCrossref)),
choice("\\xt*", "\\x*")),
"\\xt*"),
xta: $ => seq("\\xta ", $.crossrefText, optional("\\xta*")),
xop: $ => seq("\\xop ", $.crossrefText, optional("\\xop*")),
xot: $ => seq("\\xot ", $.crossrefText, optional("\\xot*")),
Expand All @@ -459,39 +466,40 @@ module.exports = grammar({
$._nestedCharacterMarker,
// $._characterMarker,
$.footnote,
$.crossref
$.crossref,
$.fig,
))),

add: $ => seq("\\add", $._innerText, "\\add*"),
bk: $ => seq("\\bk", $._innerText, "\\bk*"),
dc: $ => seq("\\dc", $._innerText, "\\dc*"),
k: $ => seq("\\k", $._innerText, "\\k*"),
nd: $ => seq("\\nd", $._innerText, "\\nd*"),
ord: $ => seq("\\ord", $._innerText, "\\ord*"),
pn: $ => seq("\\pn", $._innerText, "\\pn*"),
png: $ => seq("\\png", $._innerText, "\\png*"),
addpn: $ => seq("\\addpn", $._innerText, "\\addpn*"),
qt: $ => seq("\\qt", $._innerText, "\\qt*"),
sig: $ => seq("\\sig", $._innerText, "\\sig*"),
sls: $ => seq("\\sls", $._innerText, "\\sls*"),
tl: $ => seq("\\tl", $._innerText, "\\tl*"),
wj: $ => seq("\\wj", $._innerText, "\\wj*"),

em: $ => seq("\\em", $._innerText, "\\em*"),
bd: $ => seq("\\bd", $._innerText, "\\bd*"),
it: $ => seq("\\it", $._innerText, "\\it*"),
bdit: $ => seq("\\bdit", $._innerText, "\\bdit*"),
no: $ => seq("\\no", $._innerText, "\\no*"),
sc: $ => seq("\\sc", $._innerText, "\\sc*"),
sup: $ => seq("\\sup", $._innerText, "\\sup*"),

ndx: $ => seq("\\ndx", $._innerText, "\\ndx*"),
pro: $ => seq("\\pro", $._innerText, "\\pro*"),
rb: $ => seq("\\rb", $._innerText, choice($.defaultAttribute, $._rbAttributes), "\\rb*"),
w: $ => seq("\\w", $._innerText, optional(choice($.defaultAttribute, $._wAttributes)), "\\w*"),
wg: $ => seq("\\wg", $._innerText, "\\wg*"),
wh: $ => seq("\\wh", $._innerText, "\\wh*"),
wa: $ => seq("\\wa", $._innerText, "\\wa*"),
add: $ => seq("\\add", $._spaceOrLine, $._innerText, "\\add*"),
bk: $ => seq("\\bk", $._spaceOrLine, $._innerText, "\\bk*"),
dc: $ => seq("\\dc", $._spaceOrLine, $._innerText, "\\dc*"),
k: $ => seq("\\k", $._spaceOrLine, $._innerText, optional(choice($.defaultAttribute, $._kAttributes)), "\\k*"),
nd: $ => seq("\\nd", $._spaceOrLine, $._innerText, "\\nd*"),
ord: $ => seq("\\ord", $._spaceOrLine, $._innerText, "\\ord*"),
pn: $ => seq("\\pn", $._spaceOrLine, $._innerText, "\\pn*"),
png: $ => seq("\\png", $._spaceOrLine, $._innerText, "\\png*"),
addpn: $ => seq("\\addpn", $._spaceOrLine, $._innerText, "\\addpn*"),
qt: $ => seq("\\qt", $._spaceOrLine, $._innerText, "\\qt*"),
sig: $ => seq("\\sig", $._spaceOrLine, $._innerText, "\\sig*"),
sls: $ => seq("\\sls", $._spaceOrLine, $._innerText, "\\sls*"),
tl: $ => seq("\\tl", $._spaceOrLine, $._innerText, "\\tl*"),
wj: $ => seq("\\wj", $._spaceOrLine, $._innerText, "\\wj*"),

em: $ => seq("\\em", $._spaceOrLine, $._innerText, "\\em*"),
bd: $ => seq("\\bd", $._spaceOrLine, $._innerText, "\\bd*"),
it: $ => seq("\\it", $._spaceOrLine, $._innerText, "\\it*"),
bdit: $ => seq("\\bdit", $._spaceOrLine, $._innerText, "\\bdit*"),
no: $ => seq("\\no", $._spaceOrLine, $._innerText, "\\no*"),
sc: $ => seq("\\sc", $._spaceOrLine, $._innerText, "\\sc*"),
sup: $ => seq("\\sup", $._spaceOrLine, $._innerText, "\\sup*"),

ndx: $ => seq("\\ndx", $._spaceOrLine, $._innerText, "\\ndx*"),
pro: $ => seq("\\pro", $._spaceOrLine, $._innerText, "\\pro*"),
rb: $ => seq("\\rb", $._spaceOrLine, $._innerText, choice($.defaultAttribute, $._rbAttributes), "\\rb*"),
w: $ => seq("\\w", $._spaceOrLine, $._innerText, optional(choice($.defaultAttribute, $._wAttributes)), "\\w*"),
wg: $ => seq("\\wg", $._spaceOrLine, $._innerText, "\\wg*"),
wh: $ => seq("\\wh", $._spaceOrLine, $._innerText, "\\wh*"),
wa: $ => seq("\\wa", $._spaceOrLine, $._innerText, "\\wa*"),

_characterMarker: $ => choice(
$.add,
Expand Down Expand Up @@ -523,42 +531,42 @@ module.exports = grammar({
$.wh,
$.wa,
$.jmp,
$.fig,
// $.fig,
// $.zNameSpace, makes all zNameSpaces part of paragraph content, like milestones
),

addNested: $ => seq("\\+add", $._innerText, "\\+add*"),
bkNested: $ => seq("\\+bk", $._innerText, "\\+bk*"),
dcNested: $ => seq("\\+dc", $._innerText, "\\+dc*"),
kNested: $ => seq("\\+k", $._innerText, "\\+k*"),
ndNested: $ => seq("\\+nd", $._innerText, "\\+nd*"),
ordNested: $ => seq("\\+ord", $._innerText, "\\+ord*"),
pnNested: $ => seq("\\+pn", $._innerText, "\\+pn*"),
pngNested: $ => seq("\\+png", $._innerText, "\\+png*"),
addpnNested: $ => seq("\\+addpn", $._innerText, "\\+addpn*"),
qtNested: $ => seq("\\+qt", $._innerText, "\\+qt*"),
sigNested: $ => seq("\\+sig", $._innerText, "\\+sig*"),
slsNested: $ => seq("\\+sls", $._innerText, "\\+sls*"),
tlNested: $ => seq("\\+tl", $._innerText, "\\+tl*"),
wjNested: $ => seq("\\+wj", $._innerText, "\\+wj*"),

emNested: $ => seq("\\+em", $._innerText, "\\+em*"),
bdNested: $ => seq("\\+bd", $._innerText, "\\+bd*"),
itNested: $ => seq("\\+it", $._innerText, "\\+it*"),
bditNested: $ => seq("\\+bdit", $._innerText, "\\+bdit*"),
noNested: $ => seq("\\+no", $._innerText, "\\+no*"),
scNested: $ => seq("\\+sc", $._innerText, "\\+sc*"),
supNested: $ => seq("\\+sup", $._innerText, "\\+sup*"),

ndxNested: $ => seq("\\+ndx", $._innerText, "\\+ndx*"),
proNested: $ => seq("\\+pro", $._innerText, "\\+pro*"),
rbNested: $ => seq("\\+rb", $._innerText, optional(
addNested: $ => seq("\\+add", $._spaceOrLine, $._innerText, "\\+add*"),
bkNested: $ => seq("\\+bk", $._spaceOrLine, $._innerText, "\\+bk*"),
dcNested: $ => seq("\\+dc", $._spaceOrLine, $._innerText, "\\+dc*"),
kNested: $ => seq("\\+k", $._spaceOrLine, $._innerText, "\\+k*"),
ndNested: $ => seq("\\+nd", $._spaceOrLine, $._innerText, "\\+nd*"),
ordNested: $ => seq("\\+ord", $._spaceOrLine, $._innerText, "\\+ord*"),
pnNested: $ => seq("\\+pn", $._spaceOrLine, $._innerText, "\\+pn*"),
pngNested: $ => seq("\\+png", $._spaceOrLine, $._innerText, "\\+png*"),
addpnNested: $ => seq("\\+addpn", $._spaceOrLine, $._innerText, "\\+addpn*"),
qtNested: $ => seq("\\+qt", $._spaceOrLine, $._innerText, "\\+qt*"),
sigNested: $ => seq("\\+sig", $._spaceOrLine, $._innerText, "\\+sig*"),
slsNested: $ => seq("\\+sls", $._spaceOrLine, $._innerText, "\\+sls*"),
tlNested: $ => seq("\\+tl", $._spaceOrLine, $._innerText, "\\+tl*"),
wjNested: $ => seq("\\+wj", $._spaceOrLine, $._innerText, "\\+wj*"),

emNested: $ => seq("\\+em", $._spaceOrLine, $._innerText, "\\+em*"),
bdNested: $ => seq("\\+bd", $._spaceOrLine, $._innerText, "\\+bd*"),
itNested: $ => seq("\\+it", $._spaceOrLine, $._innerText, "\\+it*"),
bditNested: $ => seq("\\+bdit", $._spaceOrLine, $._innerText, "\\+bdit*"),
noNested: $ => seq("\\+no", $._spaceOrLine, $._innerText, "\\+no*"),
scNested: $ => seq("\\+sc", $._spaceOrLine, $._innerText, "\\+sc*"),
supNested: $ => seq("\\+sup", $._spaceOrLine, $._innerText, "\\+sup*"),

ndxNested: $ => seq("\\+ndx", $._spaceOrLine, $._innerText, "\\+ndx*"),
proNested: $ => seq("\\+pro", $._spaceOrLine, $._innerText, "\\+pro*"),
rbNested: $ => seq("\\+rb", $._spaceOrLine, $._innerText, optional(
choice($.defaultAttribute, $._rbAttributes)), "\\+rb*"),
wNested: $ => seq("\\+w", $._innerText, optional(
wNested: $ => seq("\\+w", $._spaceOrLine, $._innerText, optional(
choice($.defaultAttribute, $._wAttributes)), "\\+w*"),
wgNested: $ => seq("\\+wg", $._innerText, "\\+wg*"),
whNested: $ => seq("\\+wh", $._innerText, "\\+wh*"),
waNested: $ => seq("\\+wa", $._innerText, "\\+wa*"),
wgNested: $ => seq("\\+wg", $._spaceOrLine, $._innerText, "\\+wg*"),
whNested: $ => seq("\\+wh", $._spaceOrLine, $._innerText, "\\+wh*"),
waNested: $ => seq("\\+wa", $._spaceOrLine, $._innerText, "\\+wa*"),

_nestedCharacterMarker: $ => choice(
$.addNested,
Expand Down Expand Up @@ -669,6 +677,8 @@ module.exports = grammar({
attributed elements are defined here*/

// _wAttributes: $ => seq("|", $.lemmaAttribute),
_kAttributes: $ => $.keyAttribute,
keyAttribute: $ => seq("key", "=", '"', optional($.attributeValue), '"'),
_wAttributes: $ => prec.right(0, seq("|", repeat1(choice($.lemmaAttribute, $.strongAttribute,
$.scrlocAttribute, $.linkAttribute, $.customAttribute)))),
_rbAttributes: $ => prec.right(0, seq("|", repeat1(choice($.glossAttribute, $.customAttribute,
Expand Down

0 comments on commit 9edddf8

Please sign in to comment.