Skip to content

Commit

Permalink
Modify USJ spec (#226)
Browse files Browse the repository at this point in the history
* change oneOf in USJ schema to anyOf

* change USJ schema: oneOf and type field

* update USJ generation as per the changes

* update List generation as it uses USJ output

* update USJ filtering to work with changed USJ output

* update USFM generation from USJ, for the output changes

* Change the USJ samples in test suite as per new structure

* test and fix the JSON conversion

* test and fix the List conversion

* Add new USJ for the bigger test samples too

* Change test until the next tcdocs tests' sync
  • Loading branch information
kavitharaju authored Nov 10, 2023
1 parent bf8b42d commit 93df393
Show file tree
Hide file tree
Showing 188 changed files with 181,011 additions and 88,128 deletions.
12 changes: 7 additions & 5 deletions py-usfm-parser/src/usfm_grammar/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ def exclude_markers_in_usj(input_usj,
return []
return [input_usj]
cleaned_kids = []
exclude_markers = [re.sub(trailing_num_pattern, '', item.split(':')[-1])
exclude_markers = [re.sub(trailing_num_pattern, '', item)
for item in exclude_markers]
this_marker = re.sub(trailing_num_pattern,'', input_usj['type'].split(':')[-1])
this_marker = input_usj['marker'] if 'marker' in input_usj else ''
this_marker = re.sub(trailing_num_pattern, '', this_marker)
this_marker_needed = True
excluded_parent=False # used to check if its text is needed or not, in the subsequent call
inner_content_needed = True
Expand Down Expand Up @@ -92,13 +93,14 @@ def include_markers_in_usj(input_usj,
return []
return [input_usj]
cleaned_kids = []
include_markers = [re.sub(trailing_num_pattern,'', item.split(':')[-1])
include_markers = [re.sub(trailing_num_pattern,'', item)
for item in include_markers]
this_marker = re.sub(trailing_num_pattern,'', input_usj['type'].split(':')[-1])
this_marker = input_usj['marker'] if 'marker' in input_usj else ''
this_marker = re.sub(trailing_num_pattern, '', this_marker)
this_marker_needed = True
excluded_parent = False # used to check if its text is needed or not in the subsequent call
inner_content_needed = True
if this_marker not in include_markers:
if this_marker not in include_markers+['']:
this_marker_needed =False
excluded_parent = True
if this_marker in MARKERS_WITH_DISCARDABLE_CONTENTS:
Expand Down
12 changes: 7 additions & 5 deletions py-usfm-parser/src/usfm_grammar/list_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def __init__(self):
self.book = ""
self.current_chapter = ""
self.current_verse = ""
self.list = [["Book","Chapter","Verse","Text","Type"]]
self.list = [["Book","Chapter","Verse","Text","Type","Marker"]]

def usj_to_list_id(self, obj):
'''update book code'''
Expand All @@ -24,20 +24,22 @@ def usj_to_list_v(self, obj):

def usj_to_list(self, obj):
'''Traverse the USJ dict and build the table in self.list'''
if obj['type'] == "book:id":
if obj['type'] == "book":
self.usj_to_list_id(obj)
elif obj['type'] == "chapter:c":
elif obj['type'] == "chapter":
self.usj_to_list_c(obj)
elif obj['type'] == "verse:v":
elif obj['type'] == "verse":
self.usj_to_list_v(obj)
marker_type = obj['type']
marker_name = obj['marker'] if "marker" in obj else ''
if marker_type == "USJ":
# This would occur if the JSON got flattened after removing paragraph markers
marker_type = ""
if 'content' in obj:
for item in obj['content']:
if isinstance(item, str):
self.list.append(
[self.book, self.current_chapter, self.current_verse, item, marker_type])
[self.book, self.current_chapter, self.current_verse,
item, marker_type, marker_name])
else:
self.usj_to_list(item)
33 changes: 16 additions & 17 deletions py-usfm-parser/src/usfm_grammar/usfm_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
NO_USFM_USJ_TYPES = ['USJ', 'table']
NO_NEWLINE_USJ_TYPES = ['char', 'note', 'verse', 'table:cell']
CLOSING_USJ_TYPES = ['char', 'note', 'figure']
NON_ATTRIB_USJ_KEYS = ['type', 'content', 'number', 'sid',
NON_ATTRIB_USJ_KEYS = ['type', 'marker', 'content', 'number', 'sid',
'code', 'caller', 'align',
'version', 'altnumber', 'pubnumber', 'category']

Expand All @@ -20,17 +20,16 @@ def is_valid_usfm(self, usfm_string: dict = None) -> bool:

def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=too-many-statements, too-many-branches
'''Traverses through the dict/json and uses 'type' field to form USFM elements'''
marker_types = usj_obj['type'].split(':')
if usj_obj['type'] not in NO_USFM_USJ_TYPES:
self.usfm_string += "\\"
if nested and marker_types[0] == 'char':
if nested and usj_obj['type'] == 'char':
self.usfm_string+="+"
self.usfm_string += f"{marker_types[-1]} "
self.usfm_string += f"{usj_obj['marker']} "
if 'code' in usj_obj:
self.usfm_string += f"{usj_obj['code']} "
if 'number' in usj_obj:
self.usfm_string += usj_obj['number']
if marker_types[0] == "verse":
if usj_obj['type'] == "verse":
self.usfm_string += " "
if 'caller' in usj_obj:
self.usfm_string += f"{usj_obj['caller']} "
Expand All @@ -41,7 +40,7 @@ def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=to
if isinstance(item, str):
self.usfm_string += item
else:
if marker_types[0] in ['char']:
if usj_obj['type']in ['char']:
self.usj_to_usfm(item, nested=True)
else:
self.usj_to_usfm(item)
Expand All @@ -56,30 +55,30 @@ def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=to
else:
self.usfm_string += f"{key}=\"{usj_obj[key]}\" "

if marker_types[0] in CLOSING_USJ_TYPES:
if usj_obj['type'] in CLOSING_USJ_TYPES:
self.usfm_string = self.usfm_string.strip() + "\\"
if nested and marker_types[0] == 'char':
if nested and usj_obj['type'] == 'char':
self.usfm_string+="+"
self.usfm_string += f"{marker_types[-1]}* "
if marker_types[0] == "ms":
self.usfm_string += f"{usj_obj['marker']}* "
if usj_obj['type'] == "ms":
if "sid" in usj_obj:
if not attributes:
self.usfm_string += "|"
attributes = True
self.usfm_string += f"sid=\"{usj_obj['sid']}\" "
self.usfm_string = self.usfm_string.strip() + "\\*"
if marker_types[0] == "sidebar":
if usj_obj['type'] == "sidebar":
self.usfm_string += "\\esbe"
if ":".join(marker_types[:-1]) not in NO_NEWLINE_USJ_TYPES and \
if usj_obj['type'] not in NO_NEWLINE_USJ_TYPES and \
self.usfm_string[-1] != "\n":
self.usfm_string += "\n"
if "altnumber" in usj_obj:
self.usfm_string += f"\\{marker_types[-1]}a {usj_obj['altnumber']}"
self.usfm_string += f"\\{marker_types[-1]}a* "
self.usfm_string += f"\\{usj_obj['marker']}a {usj_obj['altnumber']}"
self.usfm_string += f"\\{usj_obj['marker']}a* "
if "pubnumber" in usj_obj:
self.usfm_string += f"\\{marker_types[-1]}p {usj_obj['pubnumber']}"
if marker_types[-1] == "v":
self.usfm_string += f"\\{marker_types[-1]}p* "
self.usfm_string += f"\\{usj_obj['marker']}p {usj_obj['pubnumber']}"
if usj_obj['marker'] == "v":
self.usfm_string += f"\\{usj_obj['marker']}p* "
else:
self.usfm_string += "\n"

Expand Down
38 changes: 20 additions & 18 deletions py-usfm-parser/src/usfm_grammar/usj_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ def __init__(self, tree_sitter_language_obj, usfm_bytes, usj_root_obj=None):
def findlast_from_json(self, json_obj, type_value):
'''Traverse the given JSON and list all elements with given value in type field'''
output = None
if json_obj['type'] == type_value or type_value in json_obj['type'].split(':'):
if type_value == json_obj['type']:
output = json_obj
elif "marker" in json_obj and type_value == json_obj['marker']:
output = json_obj
if 'content' in json_obj:
for child in json_obj['content']:
Expand All @@ -53,7 +55,7 @@ def node_2_usj_id(self, node, parent_json_obj):
code = self.usfm[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
elif tupl[1] == 'desc':
desc = self.usfm[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
book_json_obj = {"type": "book:id", "content":[]}
book_json_obj = {"type": "book", "marker":"id", "content":[]}
book_json_obj['code'] = code
if desc is not None and desc.strip() != "":
book_json_obj['content'].append(desc.strip())
Expand All @@ -68,10 +70,10 @@ def node_2_usj_c(self, node, parent_json_obj):
chap_num = self.usfm[chap_cap[0][0].start_byte:chap_cap[0][0].end_byte].decode('utf-8')
chap_ref = None
for child in self.json_root_obj['content']:
if child['type'] == "book:id":
if child['type'] == "book":
chap_ref = child['code']+" "+chap_num
break
chap_json_obj = {"type":"chapter:c"}
chap_json_obj = {"type":"chapter", "marker":"c"}
chap_json_obj["number"] = chap_num
chap_json_obj["sid"] = chap_ref
for tupl in chap_cap:
Expand Down Expand Up @@ -105,7 +107,7 @@ def node_2_usj_verse(self, node, parent_json_obj):
)''').captures(node)
verse_num = self.usfm[verse_num_cap[0][0].start_byte:
verse_num_cap[0][0].end_byte].decode('utf-8')
v_json_obj = {"type":"verse:v"}
v_json_obj = {"type":"verse", "marker":"v"}
for tupl in verse_num_cap:
if tupl[1] == 'alt':
alt_num = self.usfm[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
Expand All @@ -121,7 +123,7 @@ def node_2_usj_verse(self, node, parent_json_obj):
def node_2_usj_ca_va(self, node, parent_json_obj):
'''Build elements for independent ca and va away from c and v'''
style = node.type
char_json_obj = {"type":f"char:{style}"}
char_json_obj = {"type":"char", "marker":style}
alt_num_match = self.usfm_language.query('''([
(chapterNumber)
(verseNumber)
Expand All @@ -143,14 +145,14 @@ def node_2_usj_para(self, node, parent_json_obj):
if para_marker == "b":
self.node_2_usj_special(para_tag_cap[0], parent_json_obj)
elif not para_marker.endswith("Block"):
para_json_obj = {"type": f"para:{para_marker}", "content":[]}
para_json_obj = {"type": "para", "marker":para_marker, "content":[]}
for child in para_tag_cap[0].children[1:]:
self.node_2_usj(child, para_json_obj)
parent_json_obj['content'].append(para_json_obj)
elif node.type in ['pi', "ph"]:
para_marker = self.usfm[node.children[0].start_byte:\
node.children[0].end_byte].decode('utf-8').replace("\\", "").strip()
para_json_obj = {"type": f"para:{para_marker}", "content":[]}
para_json_obj = {"type": "para", "marker":para_marker, "content":[]}
for child in node.children[1:]:
self.node_2_usj(child, para_json_obj)
parent_json_obj['content'].append(para_json_obj)
Expand All @@ -161,7 +163,7 @@ def node_2_usj_notes(self, node, parent_json_obj):
caller_node = node.children[1]
style = self.usfm[tag_node.start_byte:tag_node.end_byte].decode(
'utf-8').replace("\\","").strip()
note_json_obj = {"type": f"note:{style}", "content":[]}
note_json_obj = {"type": "note", "marker":style, "content":[]}
note_json_obj["caller"] = \
self.usfm[caller_node.start_byte:caller_node.end_byte].decode('utf-8').strip()
for child in node.children[2:-1]:
Expand All @@ -178,7 +180,7 @@ def node_2_usj_char(self, node, parent_json_obj):
children_range = children_range-1
style = self.usfm[tag_node.start_byte:tag_node.end_byte].decode(
'utf-8').replace("\\","").replace("+","").strip()
char_json_obj = {"type": f"char:{style}", "content":[]}
char_json_obj = {"type": "char", "marker":style, "content":[]}
# if closing_node is None:
# char_json_obj["closed"] = false
# else:
Expand Down Expand Up @@ -213,15 +215,15 @@ def node_2_usj_table(self, node, parent_json_obj):
self.node_2_usj(child, table_json_obj)
parent_json_obj['content'].append(table_json_obj)
elif node.type == "tr":
row_json_obj = {"type": "table:row:tr", "content":[]}
row_json_obj = {"type": "table:row", "marker":"tr", "content":[]}
for child in node.children[1:]:
self.node_2_usj(child, row_json_obj)
parent_json_obj['content'].append(row_json_obj)
elif node.type in self.TABLE_CELL_MARKERS:
tag_node = node.children[0]
style = self.usfm[tag_node.start_byte:tag_node.end_byte].decode('utf-8')\
.replace("\\","").strip()
cell_json_obj = {"type": f"table:cell:{style}", "content":[]}
cell_json_obj = {"type": "table:cell", "marker":style, "content":[]}
if "r" in style:
cell_json_obj["align"] = "end"
else:
Expand All @@ -240,7 +242,7 @@ def node_2_usj_milestone(self, node, parent_json_obj):
] @ms-name)''').captures(node)[0]
style = self.usfm[ms_name_cap[0].start_byte:ms_name_cap[0].end_byte].decode('utf-8')\
.replace("\\","").strip()
ms_json_obj = {"type": f"ms:{style}", "content":[]}
ms_json_obj = {"type": "ms", 'marker':style, "content":[]}
for child in node.children:
if child.type.endswith("Attribute"):
self.node_2_usj(child, ms_json_obj)
Expand All @@ -252,7 +254,7 @@ def node_2_usj_milestone(self, node, parent_json_obj):
def node_2_usj_special(self, node, parent_json_obj):
'''Build nodes for esb, cat, fig, optbreak in USX'''
if node.type == "esb":
sidebar_json_obj = {"type": "sidebar:esb", "content":[]}
sidebar_json_obj = {"type": "sidebar", "marker":"esb", "content":[]}
for child in node.children[1:-1]:
self.node_2_usj(child, sidebar_json_obj)
parent_json_obj['content'].append(sidebar_json_obj)
Expand All @@ -261,15 +263,15 @@ def node_2_usj_special(self, node, parent_json_obj):
category = self.usfm[cat_cap[0].start_byte:cat_cap[0].end_byte].decode('utf-8').strip()
parent_json_obj['category'] = category
elif node.type == 'fig':
fig_json_obj = {"type":"figure:fig", "content":[]}
fig_json_obj = {"type":"figure", "marker":"fig", "content":[]}
for child in node.children[1:-1]:
self.node_2_usj(child, fig_json_obj)
parent_json_obj['content'].append(fig_json_obj)
elif node.type == 'b':
b_json_obj = {"type": "optbreak:b"}
b_json_obj = {"type": "optbreak", "marker":"b"}
parent_json_obj['content'].append(b_json_obj)
elif node.type == "usfm":
ver_json_obj = {"type": "para:usfm", "content":[]}
ver_json_obj = {"type": "para", "marker":"usfm", "content":[]}
version = self.usfm[
node.start_byte:node.end_byte].decode('utf-8').replace("\\usfm","").strip()
ver_json_obj['content'].append(version)
Expand All @@ -289,7 +291,7 @@ def node_2_usj_generic(self, node, parent_json_obj):
num = self.usfm[num_node.start_byte:num_node.end_byte].decode('utf-8')
style += num
children_range_start = 2
para_json_obj = {"type": f"para:{style}", "content":[]}
para_json_obj = {"type": "para", "marker":style, "content":[]}
parent_json_obj['content'].append(para_json_obj)
for child in node.children[children_range_start:]:
if child.type in self.CHAR_STYLE_MARKERS+self.NESTED_CHAR_STYLE_MARKERS+\
Expand Down
14 changes: 8 additions & 6 deletions py-usfm-parser/tests/test_json_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def test_usj_converions_without_filter(file_path):
assert not test_parser.errors, test_parser.errors
usfm_dict = test_parser.to_usj()
assert isinstance(usfm_dict, dict)
# usj_file_path = file_path.replace("origin.usfm", "origin-usj.json")
# with open(usj_file_path, 'w', encoding='utf-8') as usj_file:
# json.dump(usfm_dict, usj_file, indent=2 )

@pytest.mark.parametrize('file_path', test_files)
@pytest.mark.parametrize('exclude_markers', [
Expand Down Expand Up @@ -64,11 +67,9 @@ def get_types(element):
types = []
if isinstance(element, str):
pass
elif element['type'].split(':')[0] == "ms":
types.append('milestone')
types.append(element['type'].split(':')[-1] )
else:
types += element['type'].split(':')
if 'marker' in element:
types.append(element['marker'])
if "altnumber" in element:
if "c" in element['type']:
types.append("ca")
Expand Down Expand Up @@ -120,7 +121,8 @@ def test_usj_output_is_valid(file_path):
@pytest.mark.parametrize('file_path', test_files)
@pytest.mark.timeout(30)
def test_usj_round_tripping(file_path):
'''Convert USFM to USJ and back to USFM. Compare first USFM and second USFM based on parse tree'''
'''Convert USFM to USJ and back to USFM.
Compare first USFM and second USFM based on parse tree'''
test_parser1 = initialise_parser(file_path)
assert not test_parser1.errors, test_parser1.errors
usj_dict = test_parser1.to_usj()
Expand Down Expand Up @@ -167,7 +169,7 @@ def test_compare_usj_with_testsuite_samples(file_path):
usx_file_path = file_path.replace("origin.usfm", "origin.xml")
if usx_file_path not in exclude_USX_files:
usj_dict = test_parser.to_usj()
remove_newlines_in_text(usj_dict)
# remove_newlines_in_text(usj_dict) # need this if using USJ generated from tcdocs
try:
usj_file_path = file_path.replace("origin.usfm", "origin-usj.json")
with open(usj_file_path, 'r', encoding='utf-8') as usj_file:
Expand Down
4 changes: 2 additions & 2 deletions py-usfm-parser/tests/test_list_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_list_converions_with_exclude_markers(file_path, exclude_markers):
usfm_list = test_parser.to_list(exclude_markers=exclude_markers)
assert isinstance(usfm_list, list)
for row in usfm_list[1:]:
assert row[4].split(':')[-1] not in exclude_markers
assert row[5] not in exclude_markers

trailing_num_pattern = re.compile(r'\d+$')
@pytest.mark.parametrize('file_path', test_files)
Expand All @@ -42,7 +42,7 @@ def test_list_converions_with_include_markers(file_path, include_markers):
usfm_list = test_parser.to_list(include_markers=include_markers)
assert isinstance(usfm_list, list)
for row in usfm_list[1:]:
marker = row[4].split(':')[-1]
marker = row[5]
marker = re.sub(trailing_num_pattern, "", marker)
assert marker in include_markers

10 changes: 7 additions & 3 deletions schemas/usj.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@
"type": "object",
"properties": {
"type": {
"description": "The kind of node or element this is, corresponding each marker in USFM or each node in USX",
"description": "The kind/category of node or element this is, corresponding the USFM marker and USX node",
"type": "string"
},
"marker": {
"description": "The corresponding marker in USFM or style in USX",
"type": "string"
},
"content": {
"type": "array",
"items": {
"oneOf":[
"anyOf":[
{"type": "string"},
{"$ref": "#/$defs/markerObject"}
]
Expand Down Expand Up @@ -71,7 +75,7 @@
"description": "The JSON representation of scripture contents from USFM/USX",
"type": "array",
"items":{
"oneOf":[
"anyOf":[
{"type": "string"},
{"$ref": "#/$defs/markerObject"}
]
Expand Down
Loading

0 comments on commit 93df393

Please sign in to comment.