From usx parsing (#230)

* Build USX to USFM conversion * integrate it as 'from_usx' option in usfm-parser * test and fix USX to USFM conversion feature * Add infile=usx option in CLI * Update documentation, including the new feature of from-usx parsing * fix linter errors
Bridgeconn · Feb 6, 2024 · 49a3bd3 · 49a3bd3
1 parent 6dc912a
commit 49a3bd3
Show file tree

Hide file tree

Showing 7 changed files with 203 additions and 27 deletions.
diff --git a/docs/Dev_notes.md b/docs/Dev_notes.md
@@ -22,4 +22,25 @@ In python module,
 cd py-usfm-parser
 source ENV-dev
 bumpversion --new-version 3.0.0-alpha.28 num
-```
+```
+
+## Run tests
+To check Syntax trees in Grammar module
+```
+cd tree-sitter-usfm3
+export PATH=$PATH:./node_modules/.bin
+tree-sitter generate
+tree-sitter test
+```
+
+In python module alone
+
+```
+cd py-usfm-parser
+python -m pytest -n auto
+
+# to run selectively
+pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc_grammar and not generated_usx_with_rnc_grammar and not samples-from-wild" -n auto
+
+```
+
diff --git a/py-usfm-parser/README.md b/py-usfm-parser/README.md
@@ -36,15 +36,15 @@ errors = my_parser.errors
 print(errors)
 ```
 
-To convert to USX
+##### To convert to USX
 ```
 from lxml import etree
 
 usx_elem = my_parser.to_usx() # default filter=ALL
 print(etree.tostring(usx_elem, encoding="unicode", pretty_print=True))
 ```
 
-To convert to Dict
+##### To convert to Dict/USJ
 
 ```
 output = my_parser.to_usj() # default all markers
@@ -68,15 +68,15 @@ print(output)
 ```
 To understand more about how `exclude_markers`, `include_markers`, `combine_texts` and `Filter` work, refer to the section on [filtering on USJ](#filtering-on-usj)
 
-To save as json
+##### To save as json
 ```
 import json
 dict_output = my_parser.to_usj()
 with open("file_path.json", "w", encoding='utf-8') as fp:
 	json.dump(dict_output, fp)
 ```
 
-To convert to List or table like format
+##### To convert to List or table like format
 ```
 list_output = my_parser.to_list() 
 #list_output = my_parser.to_list([Filter.SCRIPTURE_TEXT])
@@ -86,7 +86,7 @@ print(table_output)
 
 ```
 
-To round trip with USJ
+##### To round trip with USJ
 ```
 from usfm_grammar import USFMParser, Filter
 
@@ -98,7 +98,7 @@ print(my_parser2.usfm)
 ```
 :warning: There will be differences between first USFM and the generated one in 1. Spaces and lines 2. Default attributes will be given their names 3. Closing markers may be newly added
 
-To remove unwanted markers from USFM
+##### To remove unwanted markers from USFM
 ```
 from usfm_grammar import USFMParser, Filter, USFMGenerator
 
@@ -108,9 +108,9 @@ usj_obj = my_parser.to_usj(include_markers=Filter.BCV+Filter.TEXT)
 my_parser2 = USFMParser(from_usj=usj_obj)
 print(my_parser2.usfm)
 ```
-USJ to USX or Table
+##### USJ to USX or Table
 ```
-rom usfm_grammar import USFMParser, Filter
+from usfm_grammar import USFMParser, Filter
 
 my_parser = USFMParser(input_usfm_str)
 usj_obj = my_parser.to_usj()
@@ -119,10 +119,27 @@ my_parser2 = USFMParser(from_usj=usj_obj)
 print(my_parser2.to_usx())
 # print(my_parser2.to_list())
 ```
+
+##### USX to USFM, USJ or Table
+```
+from usfm_grammar import USFMParser, Filter
+from lxml import etree
+
+test_xml_file = "sample_usx.xml"
+with open(test_xml_file, 'r', encoding='utf-8') as usx_file:
+    usx_str = usx_file.read()
+    usx_obj = etree.fromstring(usx_str)
+
+    my_parser = USFMParser(from_usx=usx_obj)
+    print(my_parser.usfm)
+    # print(my_parser.to_usj())
+    # print(my_parser.to_list())
+```
+
 ### From CLI
 
 ```
-usage: usfm-grammar [-h] [--in_format {usfm,usj}]
+usage: usfm-grammar [-h] [--in_format {usfm,usj,usx}]
                     [--out_format {usj,table,syntax-tree,usx,markdown,usfm}]
                     [--include_markers {book_headers,titles,...}]
                     [--exclude_markers {book_headers,titles,...}]

diff --git a/py-usfm-parser/src/usfm_grammar/__main__.py b/py-usfm-parser/src/usfm_grammar/__main__.py
@@ -18,9 +18,12 @@ def handle_input_file(arg_parser):
     with open(infile, 'r', encoding='utf-8') as usfm_file:
         file_content = usfm_file.read()
 
-    if input_format == Format.JSON or infile.split(".")[-1] in ['json', 'usj']:
+    if input_format == Format.JSON or infile.split(".")[-1].lower() in ['json', 'usj']:
         usj_obj = json.loads(file_content)
         my_parser = USFMParser(from_usj=usj_obj)
+    elif input_format == Format.USX or infile.split(".")[-1].lower() in ['xml', 'usx']:
+        usx_obj = etree.fromstring(file_content)
+        my_parser = USFMParser(from_usx=usx_obj)
     elif input_format == Format.USFM:
         my_parser = USFMParser(file_content)
     else:
@@ -62,7 +65,7 @@ def main(): #pylint: disable=too-many-locals
     arg_parser.add_argument('infile', type=str, help='input usfm or usj file')
 
     arg_parser.add_argument('--in_format', type=str, help='input file format',
-                            choices=[Format.USFM.value, Format.JSON.value],
+                            choices=[Format.USFM.value, Format.JSON.value, Format.USX.value],
                             default=Format.USFM.value)
     arg_parser.add_argument('--out_format', type=str, help='output format',
                             choices=[itm.value for itm in Format],

diff --git a/py-usfm-parser/src/usfm_grammar/usfm_generator.py b/py-usfm-parser/src/usfm_grammar/usfm_generator.py
@@ -6,17 +6,22 @@
 NON_ATTRIB_USJ_KEYS = ['type', 'marker', 'content', 'number', 'sid',
                         'code', 'caller', 'align',
                         'version', 'altnumber', 'pubnumber', 'category']
+NON_ATTRIB_USX_KEYS =  ['number', 'code', 'caller', 'align', "sid", "eid",
+                        "style", "closed", "vid", "status",'version',
+                        'altnumber', 'pubnumber', 'category']
+NO_NEWLINE_USX_TYPES = ['char', 'note', "cell", "figure", "usx", "book", "optbreak"]
+CLOSING_USX_TYPES = ['char', 'note', 'figure', "ms"]
 
 class USFMGenerator:
     '''Combines the different methods that generate USFM from other formats in one class'''
     def __init__(self):
         self.usfm_string = ''
 
-    def is_valid_usfm(self, usfm_string: dict = None) -> bool:
-        '''Check the generated or passed USFM's correctness using the grammar'''
-        if usfm_string is None:
-            usfm_string = self.usfm_string
-        return False
+    # def is_valid_usfm(self, usfm_string: str = None) -> bool:
+    #     '''Check the generated or passed USFM's correctness using the grammar'''
+    #     if usfm_string is None:
+    #         usfm_string = self.usfm_string
+    #     return False
 
     def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=too-many-statements, too-many-branches
         '''Traverses through the dict/json and uses 'type' field to form USFM elements'''
@@ -82,7 +87,91 @@ def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=to
             else:
                 self.usfm_string += "\n"
 
-    # def usx_to_usfm(self, usx_xml_tree) -> str: # should we call it just from_usx() instead
-    #     '''Traverses xml tree and converts nodes to usfm elements
-    #     based on type and style fields'''
-    #     return self.usfm_string
+    def usx_to_usfm(self, xml_obj, nested=False): # pylint: disable=too-many-statements, too-many-branches
+        '''Traverses xml tree and converts nodes to usfm elements
+        based on type and style fields'''
+        if isinstance(xml_obj, str):
+            self.usfm_string += xml_obj
+            return
+        obj_type = xml_obj.tag
+        marker = None
+        usfm_attributes = []
+        if obj_type in ["verse", "chapter"] and "eid" in xml_obj.attrib:
+            return
+        if obj_type not in NO_NEWLINE_USX_TYPES:
+            self.usfm_string += "\n"
+        if obj_type == "optbreak":
+            if self.usfm_string != "" and self.usfm_string[-1] not in ["\n", "\r", " ", "\t"]:
+                self.usfm_string += " "
+            self.usfm_string += "// "
+        if "style" in xml_obj.attrib:
+            marker = xml_obj.attrib["style"]
+            if nested and obj_type=="char":
+                marker = "+"+marker
+            self.usfm_string += f"\\{marker} "
+        if "code" in xml_obj.attrib:
+            self.usfm_string += xml_obj.attrib['code']
+        if "number" in xml_obj.attrib:
+            self.usfm_string += f"{xml_obj.attrib['number']} "
+        if "caller" in xml_obj.attrib:
+            self.usfm_string += f"{xml_obj.attrib['caller']} "
+        if "altnumber" in xml_obj.attrib:
+            if obj_type == "verse":
+                self.usfm_string += f"\\va {xml_obj.attrib['altnumber']}\\va*"
+            elif obj_type == "chapter":
+                self.usfm_string += f"\n\\ca {xml_obj.attrib['altnumber']}\\ca*"
+        if "pubnumber" in xml_obj.attrib:
+            if obj_type == "verse":
+                self.usfm_string += f"\\vp {xml_obj.attrib['pubnumber']}\\vp*"
+            elif obj_type == "chapter":
+                self.usfm_string += f"\n\\cp {xml_obj.attrib['pubnumber']}"
+        if "category" in xml_obj.attrib:
+            self.usfm_string += f"\n\\cat {xml_obj.attrib['category']} \\cat*"
+        if xml_obj.text:
+            if self.usfm_string != "" and self.usfm_string[-1] not in ["\n", "\r", " ", "\t"]:
+                self.usfm_string += " "
+            self.usfm_string += xml_obj.text.strip()
+        for child in xml_obj.getchildren():
+            if obj_type in ["char"]:
+                self.usx_to_usfm(child, nested=True)
+            else:
+                self.usx_to_usfm(child, nested=False)
+            if child.tail:
+                if self.usfm_string !="" and self.usfm_string[-1] not in ["\n", "\r", " ", "\t"]:
+                    self.usfm_string += " "
+                self.usfm_string += child.tail.strip()
+        for key in xml_obj.attrib:
+            val = xml_obj.attrib[key]
+            val = val.replace('"', '')
+            if key == "file" and obj_type=="figure":
+                usfm_attributes.append(f'src="{val}"')
+            elif key not in NON_ATTRIB_USX_KEYS:
+                usfm_attributes.append(f'{key}="{val}"')
+            if key in ['sid', 'eid'] and obj_type=="ms":
+                usfm_attributes.append(f'{key}="{val}"')
+        if len(usfm_attributes) > 0:
+            self.usfm_string += "|"
+            self.usfm_string += " ".join(usfm_attributes)
+
+        if (("closed" in xml_obj.attrib and xml_obj.attrib['closed']=="true")
+            or obj_type in CLOSING_USX_TYPES
+            or len(usfm_attributes)>0):
+            # if not ("closed" in xml_obj.attrib and xml_obj.attrib['closed']=="false"):
+            if obj_type == "ms":
+                self.usfm_string += "\\*"
+            else:
+                self.usfm_string += f"\\{marker}*"
+
+if __name__ == "__main__":
+    from lxml import etree
+
+    TEST_FILE = "../../../tests/basic/cross-refs/origin.xml"
+
+    with open(TEST_FILE, 'r', encoding='utf-8') as usx_file:
+        usx_str = usx_file.read()
+        root = etree.fromstring(usx_str)
+
+        gen = USFMGenerator()
+
+        gen.usx_to_usfm(root)
+        print(gen.usfm_string)
diff --git a/py-usfm-parser/src/usfm_grammar/usfm_parser.py b/py-usfm-parser/src/usfm_grammar/usfm_parser.py
@@ -77,18 +77,32 @@ class Format(str, Enum):
 
 class USFMParser():
     """Parser class with usfmstring, syntax_tree and methods for JSON conversions"""
-    def __init__(self, usfm_string:str=None, from_usj:dict=None):
+    def __init__(self, usfm_string:str=None, from_usj:dict=None, from_usx:etree.Element=None):
         # super(USFMParser, self).__init__()
-        if usfm_string is not None and from_usj is not None:
-            raise Exception("Found USFM and USJ inputs! Only one supported in one object.")
+        inputs_given = 0
+        if usfm_string is not None:
+            inputs_given += 1
+        if from_usj is not None:
+            inputs_given += 1
+        if from_usx is not None:
+            inputs_given += 1
+
+        if inputs_given > 1:
+            raise Exception("Found more than one input!"+\
+                " Only one of USFM, USJ or USX is supported in one object.")
+        if inputs_given == 0:
+            raise Exception("Missing input! Either USFM, USJ or USX is to be provided.")
+
         if usfm_string is not None:
             self.usfm = usfm_string
         elif from_usj is not None:
             usj_converter = USFMGenerator()
             usj_converter.usj_to_usfm(from_usj)
             self.usfm = usj_converter.usfm_string
-        else:
-            raise Exception("Missing input! Either USFM or USJ to be provided.")
+        elif from_usx is not None:
+            usx_converter = USFMGenerator()
+            usx_converter.usx_to_usfm(from_usx)
+            self.usfm = usx_converter.usfm_string
 
         self.usfm_bytes = None
         self.syntax_tree = None

diff --git a/py-usfm-parser/tests/__init__.py b/py-usfm-parser/tests/__init__.py
@@ -18,6 +18,12 @@ def generate_USFM_from_USJ(input_usj):
     usj_parser = USFMParser(from_usj=input_usj)
     return usj_parser.usfm
 
+def generate_USFM_from_USX(input_usx):
+    '''Create a parser from the given USX input and return the USFM it generates'''
+    usx_parser = USFMParser(from_usx=input_usx)
+    return usx_parser.usfm
+
+
 def parse_USFM_string(usfm_string):
     '''Set up a parser obj with given string input'''
     test_parser = USFMParser(usfm_string)

diff --git a/py-usfm-parser/tests/test_usx_conversion.py b/py-usfm-parser/tests/test_usx_conversion.py
@@ -6,7 +6,8 @@
 from lxml.doctestcompare import LXMLOutputChecker, PARSE_XML
 
 from tests import all_usfm_files, initialise_parser, doubtful_usfms,\
-    doubtful_usxs, negative_tests, find_all_markers
+    doubtful_usxs, negative_tests, find_all_markers,\
+    generate_USFM_from_USX, parse_USFM_string
 
 lxml_object = etree.Element('Root')
 checker = LXMLOutputChecker()
@@ -115,3 +116,28 @@ def test_all_markers_are_in_output(file_path):
         #     marker.endswith("-e") or marker.startswith("z")):
         #     marker = "milestone"
         assert marker in all_styles or synonym in all_styles, marker
+
+known_issue_of_failed_xml_parsing = [
+    "../tests/usfmjsTests/inline_God/origin.xml",
+    "../tests/paratextTests/GlossaryCitationFormContainsNonWordformingPunctuation/origin.xml",
+]
+
+@pytest.mark.parametrize('file_path', test_files)
+@pytest.mark.timeout(30)
+def test_usx_round_tripping(file_path):
+    '''Convert USX to USFM and parse the generated USFM.
+    Passes when the parser reports no errors for the generated USFM'''
+    file_path = file_path.replace(".usfm", ".xml")
+    if file_path in known_issue_of_failed_xml_parsing:
+        return
+    with open(file_path, 'r', encoding='utf-8') as usx_file:
+        usx_str = usx_file.read()
+        if 'status="invalid"' in usx_str:
+            return
+        usx_xml = etree.fromstring(usx_str)
+
+        generated_USFM = generate_USFM_from_USX(usx_xml)
+        test_parser2 = parse_USFM_string(generated_USFM)
+        assert not test_parser2.errors, str(test_parser2.errors)+"\n"+ generated_USFM
+
+        # assert test_parser2.to_usx() == usx_xml, generated_USX not same as input USX