Modify USJ spec (#226)

* change oneOf in USJ schema to anyOf * change USJ schema: oneOf and type field * update USJ generation as per the changes * update List generation as it uses USJ output * update USJ filtering to work with changed USJ output * update USFM generation from USJ, for the output changes * Change the USJ samples in test suite as per new structure * test and fix the JSON conversion * test and fix the List conversion * Add new USJ for the bigger test samples too * Change test temporarily, until the next sync with the tcdocs tests
Bridgeconn · Nov 10, 2023 · 93df393 · 93df393
1 parent bf8b42d
commit 93df393
Show file tree

Hide file tree

Showing 188 changed files with 181,011 additions and 88,128 deletions.
diff --git a/py-usfm-parser/src/usfm_grammar/filters.py b/py-usfm-parser/src/usfm_grammar/filters.py
@@ -53,9 +53,10 @@ def exclude_markers_in_usj(input_usj,
             return []
         return [input_usj]
     cleaned_kids = []
-    exclude_markers = [re.sub(trailing_num_pattern, '', item.split(':')[-1])
+    exclude_markers = [re.sub(trailing_num_pattern, '', item)
                                                      for item in exclude_markers]
-    this_marker = re.sub(trailing_num_pattern,'', input_usj['type'].split(':')[-1])
+    this_marker = input_usj['marker'] if 'marker' in input_usj else ''
+    this_marker = re.sub(trailing_num_pattern, '', this_marker)
     this_marker_needed = True
     excluded_parent=False # used to check if its text is needed or not, in the subsequent call
     inner_content_needed = True
@@ -92,13 +93,14 @@ def include_markers_in_usj(input_usj,
             return []
         return [input_usj]
     cleaned_kids = []
-    include_markers = [re.sub(trailing_num_pattern,'', item.split(':')[-1])
+    include_markers = [re.sub(trailing_num_pattern,'', item)
                                                 for item in include_markers]
-    this_marker = re.sub(trailing_num_pattern,'', input_usj['type'].split(':')[-1])
+    this_marker = input_usj['marker'] if 'marker' in input_usj else ''
+    this_marker = re.sub(trailing_num_pattern, '', this_marker)
     this_marker_needed = True
     excluded_parent = False # used to check if its text is needed or not in the subsequent call
     inner_content_needed = True
-    if this_marker not in include_markers:
+    if this_marker not in include_markers+['']:
         this_marker_needed =False
         excluded_parent = True
         if this_marker in MARKERS_WITH_DISCARDABLE_CONTENTS:

diff --git a/py-usfm-parser/src/usfm_grammar/list_generator.py b/py-usfm-parser/src/usfm_grammar/list_generator.py
@@ -8,7 +8,7 @@ def __init__(self):
         self.book = ""
         self.current_chapter = ""
         self.current_verse = ""
-        self.list = [["Book","Chapter","Verse","Text","Type"]]
+        self.list = [["Book","Chapter","Verse","Text","Type","Marker"]]
 
     def usj_to_list_id(self, obj):
         '''update book code'''
@@ -24,20 +24,22 @@ def usj_to_list_v(self, obj):
 
     def usj_to_list(self, obj):
         '''Traverse the USJ dict and build the table in self.list'''
-        if obj['type'] == "book:id":
+        if obj['type'] == "book":
             self.usj_to_list_id(obj)
-        elif obj['type'] == "chapter:c":
+        elif obj['type'] == "chapter":
             self.usj_to_list_c(obj)
-        elif obj['type'] == "verse:v":
+        elif obj['type'] == "verse":
             self.usj_to_list_v(obj)
         marker_type = obj['type']
+        marker_name = obj['marker'] if "marker" in obj else ''
         if marker_type == "USJ":
             # This would occur if the JSON got flatttened after removing paragraph markers
             marker_type = ""
         if 'content' in obj:
             for item in obj['content']:
                 if isinstance(item, str):
                     self.list.append(
-                        [self.book, self.current_chapter, self.current_verse, item, marker_type])
+                        [self.book, self.current_chapter, self.current_verse,
+                            item, marker_type, marker_name])
                 else:
                     self.usj_to_list(item)
diff --git a/py-usfm-parser/src/usfm_grammar/usfm_generator.py b/py-usfm-parser/src/usfm_grammar/usfm_generator.py
@@ -3,7 +3,7 @@
 NO_USFM_USJ_TYPES = ['USJ', 'table']
 NO_NEWLINE_USJ_TYPES = ['char', 'note', 'verse', 'table:cell']
 CLOSING_USJ_TYPES = ['char', 'note', 'figure']
-NON_ATTRIB_USJ_KEYS = ['type', 'content', 'number', 'sid',
+NON_ATTRIB_USJ_KEYS = ['type', 'marker', 'content', 'number', 'sid',
                         'code', 'caller', 'align',
                         'version', 'altnumber', 'pubnumber', 'category']
 
@@ -20,17 +20,16 @@ def is_valid_usfm(self, usfm_string: dict = None) -> bool:
 
     def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=too-many-statements, too-many-branches
         '''Traverses through the dict/json and uses 'type' field to form USFM elements'''
-        marker_types = usj_obj['type'].split(':')
         if usj_obj['type'] not in NO_USFM_USJ_TYPES:
             self.usfm_string += "\\"
-            if nested and marker_types[0] == 'char':
+            if nested and usj_obj['type'] == 'char':
                 self.usfm_string+="+"
-            self.usfm_string += f"{marker_types[-1]} "
+            self.usfm_string += f"{usj_obj['marker']} "
         if 'code' in usj_obj:
             self.usfm_string += f"{usj_obj['code']} "
         if 'number' in usj_obj:
             self.usfm_string += usj_obj['number']
-            if marker_types[0] == "verse":
+            if usj_obj['type'] == "verse":
                 self.usfm_string += " "
         if 'caller' in usj_obj:
             self.usfm_string += f"{usj_obj['caller']} "
@@ -41,7 +40,7 @@ def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=to
                 if isinstance(item, str):
                     self.usfm_string += item
                 else:
-                    if marker_types[0] in ['char']:
+                    if usj_obj['type']in ['char']:
                         self.usj_to_usfm(item, nested=True)
                     else:
                         self.usj_to_usfm(item)
@@ -56,30 +55,30 @@ def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=to
                 else:
                     self.usfm_string += f"{key}=\"{usj_obj[key]}\" "
 
-        if marker_types[0] in CLOSING_USJ_TYPES:
+        if usj_obj['type'] in CLOSING_USJ_TYPES:
             self.usfm_string = self.usfm_string.strip() + "\\"
-            if nested and marker_types[0] == 'char':
+            if nested and usj_obj['type'] == 'char':
                 self.usfm_string+="+"
-            self.usfm_string += f"{marker_types[-1]}* "
-        if marker_types[0] == "ms":
+            self.usfm_string += f"{usj_obj['marker']}* "
+        if usj_obj['type'] == "ms":
             if "sid" in usj_obj:
                 if not attributes:
                     self.usfm_string += "|"
                     attributes = True
                 self.usfm_string += f"sid=\"{usj_obj['sid']}\" "
             self.usfm_string = self.usfm_string.strip() + "\\*"
-        if marker_types[0] == "sidebar":
+        if usj_obj['type'] == "sidebar":
             self.usfm_string += "\\esbe"
-        if ":".join(marker_types[:-1]) not in NO_NEWLINE_USJ_TYPES and \
+        if usj_obj['type'] not in NO_NEWLINE_USJ_TYPES and \
             self.usfm_string[-1] != "\n":
             self.usfm_string += "\n"
         if "altnumber" in usj_obj:
-            self.usfm_string += f"\\{marker_types[-1]}a {usj_obj['altnumber']}"
-            self.usfm_string += f"\\{marker_types[-1]}a* "
+            self.usfm_string += f"\\{usj_obj['marker']}a {usj_obj['altnumber']}"
+            self.usfm_string += f"\\{usj_obj['marker']}a* "
         if "pubnumber" in usj_obj:
-            self.usfm_string += f"\\{marker_types[-1]}p {usj_obj['pubnumber']}"
-            if marker_types[-1] == "v":
-                self.usfm_string += f"\\{marker_types[-1]}p* "
+            self.usfm_string += f"\\{usj_obj['marker']}p {usj_obj['pubnumber']}"
+            if usj_obj['marker'] == "v":
+                self.usfm_string += f"\\{usj_obj['marker']}p* "
             else:
                 self.usfm_string += "\n"
 

diff --git a/py-usfm-parser/src/usfm_grammar/usj_generator.py b/py-usfm-parser/src/usfm_grammar/usj_generator.py
@@ -31,7 +31,9 @@ def __init__(self, tree_sitter_language_obj, usfm_bytes, usj_root_obj=None):
     def findlast_from_json(self, json_obj, type_value):
         '''Traverse the given JSON and list all elements with given value in type field'''
         output = None
-        if json_obj['type'] == type_value or type_value in json_obj['type'].split(':'):
+        if type_value == json_obj['type']:
+            output = json_obj
+        elif "marker" in json_obj and type_value == json_obj['marker']:
             output = json_obj
         if 'content' in json_obj:
             for child in json_obj['content']:
@@ -53,7 +55,7 @@ def node_2_usj_id(self, node, parent_json_obj):
                 code = self.usfm[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
             elif tupl[1] == 'desc':
                 desc = self.usfm[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
-        book_json_obj = {"type": "book:id", "content":[]}
+        book_json_obj = {"type": "book", "marker":"id", "content":[]}
         book_json_obj['code'] = code
         if desc is not None and desc.strip() != "":
             book_json_obj['content'].append(desc.strip())
@@ -68,10 +70,10 @@ def node_2_usj_c(self, node, parent_json_obj):
         chap_num = self.usfm[chap_cap[0][0].start_byte:chap_cap[0][0].end_byte].decode('utf-8')
         chap_ref = None
         for child in self.json_root_obj['content']:
-            if child['type'] == "book:id":
+            if child['type'] == "book":
                 chap_ref = child['code']+" "+chap_num
                 break
-        chap_json_obj = {"type":"chapter:c"}
+        chap_json_obj = {"type":"chapter", "marker":"c"}
         chap_json_obj["number"] = chap_num
         chap_json_obj["sid"] = chap_ref
         for tupl in chap_cap:
@@ -105,7 +107,7 @@ def node_2_usj_verse(self, node, parent_json_obj):
                                 )''').captures(node)
         verse_num = self.usfm[verse_num_cap[0][0].start_byte:
             verse_num_cap[0][0].end_byte].decode('utf-8')
-        v_json_obj = {"type":"verse:v"}
+        v_json_obj = {"type":"verse", "marker":"v"}
         for tupl in verse_num_cap:
             if tupl[1] == 'alt':
                 alt_num = self.usfm[tupl[0].start_byte:tupl[0].end_byte].decode('utf-8')
@@ -121,7 +123,7 @@ def node_2_usj_verse(self, node, parent_json_obj):
     def node_2_usj_ca_va(self, node, parent_json_obj):
         '''Build elements for independant ca and va away from c and v'''
         style = node.type
-        char_json_obj = {"type":f"char:{style}"}
+        char_json_obj = {"type":"char", "marker":style}
         alt_num_match = self.usfm_language.query('''([
                                             (chapterNumber)
                                             (verseNumber)
@@ -143,14 +145,14 @@ def node_2_usj_para(self, node, parent_json_obj):
             if para_marker == "b":
                 self.node_2_usj_special(para_tag_cap[0], parent_json_obj)
             elif not para_marker.endswith("Block"):
-                para_json_obj = {"type": f"para:{para_marker}", "content":[]}
+                para_json_obj = {"type": "para", "marker":para_marker, "content":[]}
                 for child in para_tag_cap[0].children[1:]:
                     self.node_2_usj(child, para_json_obj)
                 parent_json_obj['content'].append(para_json_obj)
         elif node.type in ['pi', "ph"]:
             para_marker = self.usfm[node.children[0].start_byte:\
                 node.children[0].end_byte].decode('utf-8').replace("\\", "").strip()
-            para_json_obj = {"type": f"para:{para_marker}", "content":[]}
+            para_json_obj = {"type": "para", "marker":para_marker, "content":[]}
             for child in node.children[1:]:
                 self.node_2_usj(child, para_json_obj)
             parent_json_obj['content'].append(para_json_obj)
@@ -161,7 +163,7 @@ def node_2_usj_notes(self, node, parent_json_obj):
         caller_node = node.children[1]
         style = self.usfm[tag_node.start_byte:tag_node.end_byte].decode(
             'utf-8').replace("\\","").strip()
-        note_json_obj = {"type": f"note:{style}", "content":[]}
+        note_json_obj = {"type": "note", "marker":style, "content":[]}
         note_json_obj["caller"] = \
             self.usfm[caller_node.start_byte:caller_node.end_byte].decode('utf-8').strip()
         for child in node.children[2:-1]:
@@ -178,7 +180,7 @@ def node_2_usj_char(self, node, parent_json_obj):
             children_range = children_range-1
         style = self.usfm[tag_node.start_byte:tag_node.end_byte].decode(
             'utf-8').replace("\\","").replace("+","").strip()
-        char_json_obj = {"type": f"char:{style}", "content":[]}
+        char_json_obj = {"type": "char", "marker":style, "content":[]}
         # if closing_node is None:
         #     char_json_obj["closed"] = false
         # else:
@@ -213,15 +215,15 @@ def node_2_usj_table(self, node, parent_json_obj):
                 self.node_2_usj(child, table_json_obj)
             parent_json_obj['content'].append(table_json_obj)
         elif node.type == "tr":
-            row_json_obj = {"type": "table:row:tr", "content":[]}
+            row_json_obj = {"type": "table:row", "marker":"tr", "content":[]}
             for child in node.children[1:]:
                 self.node_2_usj(child, row_json_obj)
             parent_json_obj['content'].append(row_json_obj)
         elif node.type in self.TABLE_CELL_MARKERS:
             tag_node = node.children[0]
             style = self.usfm[tag_node.start_byte:tag_node.end_byte].decode('utf-8')\
             .replace("\\","").strip()
-            cell_json_obj = {"type": f"table:cell:{style}", "content":[]}
+            cell_json_obj = {"type": "table:cell", "marker":style, "content":[]}
             if "r" in style:
                 cell_json_obj["align"] = "end"
             else:
@@ -240,7 +242,7 @@ def node_2_usj_milestone(self, node, parent_json_obj):
              ] @ms-name)''').captures(node)[0]
         style = self.usfm[ms_name_cap[0].start_byte:ms_name_cap[0].end_byte].decode('utf-8')\
         .replace("\\","").strip()
-        ms_json_obj = {"type": f"ms:{style}", "content":[]}
+        ms_json_obj = {"type": "ms", 'marker':style, "content":[]}
         for child in node.children:
             if child.type.endswith("Attribute"):
                 self.node_2_usj(child, ms_json_obj)
@@ -252,7 +254,7 @@ def node_2_usj_milestone(self, node, parent_json_obj):
     def node_2_usj_special(self, node, parent_json_obj):
         '''Build nodes for esb, cat, fig, optbreak in USX'''
         if node.type == "esb":
-            sidebar_json_obj = {"type": "sidebar:esb", "content":[]}
+            sidebar_json_obj = {"type": "sidebar", "marker":"esb", "content":[]}
             for child in node.children[1:-1]:
                 self.node_2_usj(child, sidebar_json_obj)
             parent_json_obj['content'].append(sidebar_json_obj)
@@ -261,15 +263,15 @@ def node_2_usj_special(self, node, parent_json_obj):
             category = self.usfm[cat_cap[0].start_byte:cat_cap[0].end_byte].decode('utf-8').strip()
             parent_json_obj['category'] = category
         elif node.type == 'fig':
-            fig_json_obj = {"type":"figure:fig", "content":[]}
+            fig_json_obj = {"type":"figure", "marker":"fig", "content":[]}
             for child in node.children[1:-1]:
                 self.node_2_usj(child, fig_json_obj)
             parent_json_obj['content'].append(fig_json_obj)
         elif node.type == 'b':
-            b_json_obj = {"type": "optbreak:b"}
+            b_json_obj = {"type": "optbreak", "marker":"b"}
             parent_json_obj['content'].append(b_json_obj)
         elif node.type == "usfm":
-            ver_json_obj = {"type": "para:usfm", "content":[]}
+            ver_json_obj = {"type": "para", "marker":"usfm", "content":[]}
             version = self.usfm[
                 node.start_byte:node.end_byte].decode('utf-8').replace("\\usfm","").strip()
             ver_json_obj['content'].append(version)
@@ -289,7 +291,7 @@ def node_2_usj_generic(self, node, parent_json_obj):
             num = self.usfm[num_node.start_byte:num_node.end_byte].decode('utf-8')
             style += num
             children_range_start = 2
-        para_json_obj = {"type": f"para:{style}", "content":[]}
+        para_json_obj = {"type": "para", "marker":style, "content":[]}
         parent_json_obj['content'].append(para_json_obj)
         for child in node.children[children_range_start:]:
             if child.type in self.CHAR_STYLE_MARKERS+self.NESTED_CHAR_STYLE_MARKERS+\

diff --git a/py-usfm-parser/tests/test_json_conversion.py b/py-usfm-parser/tests/test_json_conversion.py
@@ -23,6 +23,9 @@ def test_usj_converions_without_filter(file_path):
     assert not test_parser.errors, test_parser.errors
     usfm_dict = test_parser.to_usj()
     assert isinstance(usfm_dict, dict)
+    # usj_file_path = file_path.replace("origin.usfm", "origin-usj.json")
+    # with open(usj_file_path, 'w', encoding='utf-8') as usj_file:
+    #     json.dump(usfm_dict, usj_file, indent=2 )
 
 @pytest.mark.parametrize('file_path', test_files)
 @pytest.mark.parametrize('exclude_markers', [
@@ -64,11 +67,9 @@ def get_types(element):
     types = []
     if isinstance(element, str):
         pass
-    elif element['type'].split(':')[0] == "ms":
-        types.append('milestone')
-        types.append(element['type'].split(':')[-1] )
     else:
-        types += element['type'].split(':')
+        if 'marker' in element:
+            types.append(element['marker'])
         if "altnumber" in element:
             if "c" in element['type']:
                 types.append("ca")
@@ -120,7 +121,8 @@ def test_usj_output_is_valid(file_path):
 @pytest.mark.parametrize('file_path', test_files)
 @pytest.mark.timeout(30)
 def test_usj_round_tripping(file_path):
-    '''Convert USFM to USJ and back to USFM. Compare first USFM and second USFM based on parse tree''' 
+    '''Convert USFM to USJ and back to USFM.
+    Compare first USFM and second USFM based on parse tree''' 
     test_parser1 = initialise_parser(file_path)
     assert not test_parser1.errors, test_parser1.errors
     usj_dict = test_parser1.to_usj()
@@ -167,7 +169,7 @@ def test_compare_usj_with_testsuite_samples(file_path):
     usx_file_path = file_path.replace("origin.usfm", "origin.xml")
     if usx_file_path not in exclude_USX_files:
         usj_dict = test_parser.to_usj()
-        remove_newlines_in_text(usj_dict)
+        # remove_newlines_in_text(usj_dict) # need this if using USJ generated from tcdocs
         try:
             usj_file_path = file_path.replace("origin.usfm", "origin-usj.json")
             with open(usj_file_path, 'r', encoding='utf-8') as usj_file:

diff --git a/py-usfm-parser/tests/test_list_conversion.py b/py-usfm-parser/tests/test_list_conversion.py
@@ -29,7 +29,7 @@ def test_list_converions_with_exclude_markers(file_path, exclude_markers):
     usfm_list = test_parser.to_list(exclude_markers=exclude_markers)
     assert isinstance(usfm_list, list)
     for row in usfm_list[1:]:
-        assert row[4].split(':')[-1] not in exclude_markers
+        assert row[5] not in exclude_markers
 
 trailing_num_pattern = re.compile(r'\d+$')
 @pytest.mark.parametrize('file_path', test_files)
@@ -42,7 +42,7 @@ def test_list_converions_with_include_markers(file_path, include_markers):
     usfm_list = test_parser.to_list(include_markers=include_markers)
     assert isinstance(usfm_list, list)
     for row in usfm_list[1:]:
-        marker = row[4].split(':')[-1]
+        marker = row[5]
         marker = re.sub(trailing_num_pattern, "", marker)
         assert marker in include_markers
 
diff --git a/schemas/usj.js b/schemas/usj.js
@@ -9,13 +9,17 @@
       "type": "object",
       "properties": {
         "type": { 
-        	"description": "The kind of node or element this is, corresponding each marker in USFM or each node in USX",
+        	"description": "The kind/category of node or element this is, corresponding the USFM marker and USX node",
         	"type": "string"
         },
+        "marker": { 
+          "description": "The corresponding marker in USFM or style in USX",
+          "type": "string"
+        },
         "content": {
           "type": "array",
           "items": {
-            "oneOf":[
+            "anyOf":[
               {"type": "string"},
               {"$ref": "#/$defs/markerObject"}
             ]
@@ -71,7 +75,7 @@
       "description": "The JSON representation of scripture contents from USFM/USX",
       "type": "array",
       "items":{
-        "oneOf":[
+        "anyOf":[
           {"type": "string"},
           {"$ref": "#/$defs/markerObject"}
         ]