Skip to content

Commit

Permalink
From usx parsing (#230)
Browse files Browse the repository at this point in the history
* Build USX to USFM conversion

* integrate it as 'from_usx' option in usfm-parser

* test and fix USX to USFM conversion feature

* Add infile=usx option in CLI

* Update documentation, including the new feature of from-usx parsing

* fix linter errors
  • Loading branch information
kavitharaju authored Feb 6, 2024
1 parent 6dc912a commit 49a3bd3
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 27 deletions.
23 changes: 22 additions & 1 deletion docs/Dev_notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,25 @@ In python module,
cd py-usfm-parser
source ENV-dev
bumpversion --new-version 3.0.0-alpha.28 num
```
```

## Run tests
To check Syntax trees in Grammar module
```
cd tree-sitter-usfm3
export PATH=$PATH:./node_modules/.bin
tree-sitter generate
tree-sitter test
```

In python module alone

```
cd py-usfm-parser
python -m pytest -n auto
# to run selectively
pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc_grammar and not generated_usx_with_rnc_grammar and not samples-from-wild" -n auto
```

35 changes: 26 additions & 9 deletions py-usfm-parser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@ errors = my_parser.errors
print(errors)
```

To convert to USX
##### To convert to USX
```
from lxml import etree
usx_elem = my_parser.to_usx() # default filter=ALL
print(etree.tostring(usx_elem, encoding="unicode", pretty_print=True))
```

To convert to Dict
##### To convert to Dict/USJ

```
output = my_parser.to_usj() # default all markers
Expand All @@ -68,15 +68,15 @@ print(output)
```
To understand more about how `exclude_markers`, `include_markers`, `combine_texts` and `Filter` work, refer to the section on [filtering on USJ](#filtering-on-usj)

To save as json
##### To save as json
```
import json
dict_output = my_parser.to_usj()
with open("file_path.json", "w", encoding='utf-8') as fp:
json.dump(dict_output, fp)
```

To convert to List or table like format
##### To convert to List or table like format
```
list_output = my_parser.to_list()
#list_output = my_parser.to_list([Filter.SCRIPTURE_TEXT])
Expand All @@ -86,7 +86,7 @@ print(table_output)
```

To round trip with USJ
##### To round trip with USJ
```
from usfm_grammar import USFMParser, Filter
Expand All @@ -98,7 +98,7 @@ print(my_parser2.usfm)
```
:warning: There will be differences between first USFM and the generated one in 1. Spaces and lines 2. Default attributes will be given their names 3. Closing markers may be newly added

To remove unwanted markers from USFM
##### To remove unwanted markers from USFM
```
from usfm_grammar import USFMParser, Filter, USFMGenerator
Expand All @@ -108,9 +108,9 @@ usj_obj = my_parser.to_usj(include_markers=Filter.BCV+Filter.TEXT)
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.usfm)
```
USJ to USX or Table
##### USJ to USX or Table
```
rom usfm_grammar import USFMParser, Filter
from usfm_grammar import USFMParser, Filter
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj()
Expand All @@ -119,10 +119,27 @@ my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.to_usx())
# print(my_parser2.to_list())
```

##### USX to USFM, USJ or Table
```
from usfm_grammar import USFMParser, Filter
from lxml import etree
test_xml_file = "sample_usx.xml"
with open(test_xml_file, 'r', encoding='utf-8') as usx_file:
usx_str = usx_file.read()
usx_obj = etree.fromstring(usx_str)
my_parser = USFMParser(from_usx=usx_obj)
print(my_parser.usfm)
# print(my_parser.to_usj())
# print(my_parser.to_list())
```

### From CLI

```
usage: usfm-grammar [-h] [--in_format {usfm,usj}]
usage: usfm-grammar [-h] [--in_format {usfm,usj,usx}]
[--out_format {usj,table,syntax-tree,usx,markdown,usfm}]
[--include_markers {book_headers,titles,...}]
[--exclude_markers {book_headers,titles,...}]
Expand Down
7 changes: 5 additions & 2 deletions py-usfm-parser/src/usfm_grammar/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,12 @@ def handle_input_file(arg_parser):
with open(infile, 'r', encoding='utf-8') as usfm_file:
file_content = usfm_file.read()

if input_format == Format.JSON or infile.split(".")[-1] in ['json', 'usj']:
if input_format == Format.JSON or infile.split(".")[-1].lower() in ['json', 'usj']:
usj_obj = json.loads(file_content)
my_parser = USFMParser(from_usj=usj_obj)
elif input_format == Format.USX or infile.split(".")[-1].lower() in ['xml', 'usx']:
usx_obj = etree.fromstring(file_content)
my_parser = USFMParser(from_usx=usx_obj)
elif input_format == Format.USFM:
my_parser = USFMParser(file_content)
else:
Expand Down Expand Up @@ -62,7 +65,7 @@ def main(): #pylint: disable=too-many-locals
arg_parser.add_argument('infile', type=str, help='input usfm or usj file')

arg_parser.add_argument('--in_format', type=str, help='input file format',
choices=[Format.USFM.value, Format.JSON.value],
choices=[Format.USFM.value, Format.JSON.value, Format.USX.value],
default=Format.USFM.value)
arg_parser.add_argument('--out_format', type=str, help='output format',
choices=[itm.value for itm in Format],
Expand Down
107 changes: 98 additions & 9 deletions py-usfm-parser/src/usfm_grammar/usfm_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,22 @@
# USJ keys treated as structural fields rather than marker attributes.
# NOTE(review): the code reading this list is not visible here; confirm usage.
NON_ATTRIB_USJ_KEYS = [
    "type", "marker", "content", "number", "sid",
    "code", "caller", "align",
    "version", "altnumber", "pubnumber", "category",
]

# USX attribute names that usx_to_usfm does not copy into the
# `|key="value"` attribute list of the generated marker.
NON_ATTRIB_USX_KEYS = [
    "number", "code", "caller", "align", "sid", "eid",
    "style", "closed", "vid", "status", "version",
    "altnumber", "pubnumber", "category",
]

# USX element tags for which usx_to_usfm does not start a new line.
NO_NEWLINE_USX_TYPES = [
    "char", "note", "cell", "figure", "usx", "book", "optbreak",
]

# USX element tags for which usx_to_usfm always writes a closing marker.
CLOSING_USX_TYPES = ["char", "note", "figure", "ms"]

class USFMGenerator:
'''Combines the different methods that generate USFM from other formats in one class'''
def __init__(self):
self.usfm_string = ''

def is_valid_usfm(self, usfm_string: dict = None) -> bool:
'''Check the generated or passed USFM's correctness using the grammar'''
if usfm_string is None:
usfm_string = self.usfm_string
return False
# def is_valid_usfm(self, usfm_string: str = None) -> bool:
# '''Check the generated or passed USFM's correctness using the grammar'''
# if usfm_string is None:
# usfm_string = self.usfm_string
# return False

def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=too-many-statements, too-many-branches
'''Traverses through the dict/json and uses 'type' field to form USFM elements'''
Expand Down Expand Up @@ -82,7 +87,91 @@ def usj_to_usfm(self, usj_obj: dict, nested=False) -> None: # pylint: disable=to
else:
self.usfm_string += "\n"

# def usx_to_usfm(self, usx_xml_tree) -> str: # should we call it just from_usx() instead
# '''Traverses xml tree and converts nodes to usfm elements
# based on type and style fields'''
# return self.usfm_string
def usx_to_usfm(self, xml_obj, nested=False): # pylint: disable=too-many-statements, too-many-branches
    '''Traverses xml tree and converts nodes to usfm elements
    based on type and style fields.

    The generated text is appended to self.usfm_string; nothing is returned.

    xml_obj: an XML element exposing tag, attrib, text, tail and iteration
        over its children, or a plain str which is appended unchanged.
    nested: True when the element is a child of a "char" element; a "char"
        marker is then written with the "+" prefix.
    '''
    if isinstance(xml_obj, str):
        self.usfm_string += xml_obj
        return

    obj_type = xml_obj.tag
    attribs = xml_obj.attrib
    marker = None

    # verse/chapter elements that carry an eid produce no output at all
    # (presumably these are the end milestones; confirm against the USX spec)
    if obj_type in ["verse", "chapter"] and "eid" in attribs:
        return

    if obj_type not in NO_NEWLINE_USX_TYPES:
        self.usfm_string += "\n"
    if obj_type == "optbreak":
        self._space_unless_after_whitespace()
        self.usfm_string += "// "

    if "style" in attribs:
        marker = attribs["style"]
        if nested and obj_type == "char":
            marker = "+" + marker
        self.usfm_string += f"\\{marker} "
    if "code" in attribs:
        self.usfm_string += attribs["code"]
    if "number" in attribs:
        self.usfm_string += f"{attribs['number']} "
    if "caller" in attribs:
        self.usfm_string += f"{attribs['caller']} "
    if "altnumber" in attribs:
        if obj_type == "verse":
            self.usfm_string += f"\\va {attribs['altnumber']}\\va*"
        elif obj_type == "chapter":
            self.usfm_string += f"\n\\ca {attribs['altnumber']}\\ca*"
    if "pubnumber" in attribs:
        if obj_type == "verse":
            self.usfm_string += f"\\vp {attribs['pubnumber']}\\vp*"
        elif obj_type == "chapter":
            self.usfm_string += f"\n\\cp {attribs['pubnumber']}"
    if "category" in attribs:
        self.usfm_string += f"\n\\cat {attribs['category']} \\cat*"

    if xml_obj.text:
        self._space_unless_after_whitespace()
        self.usfm_string += xml_obj.text.strip()

    # Iterate the element directly: getchildren() is deprecated in lxml and
    # no longer exists on xml.etree.ElementTree elements (Python 3.9+).
    for child in xml_obj:
        self.usx_to_usfm(child, nested=obj_type == "char")
        if child.tail:
            self._space_unless_after_whitespace()
            self.usfm_string += child.tail.strip()

    # Attributes are written after the content, as `|key="value" ...`
    usfm_attributes = []
    for key, val in attribs.items():
        val = val.replace('"', '')  # a quote inside the value would end it early
        if key == "file" and obj_type == "figure":
            usfm_attributes.append(f'src="{val}"')
        elif key not in NON_ATTRIB_USX_KEYS:
            usfm_attributes.append(f'{key}="{val}"')
        # sid/eid are in NON_ATTRIB_USX_KEYS, so for "ms" they are added here
        if key in ['sid', 'eid'] and obj_type == "ms":
            usfm_attributes.append(f'{key}="{val}"')
    if usfm_attributes:
        self.usfm_string += "|" + " ".join(usfm_attributes)

    # NOTE(review): an element without a style attribute that reaches the
    # else branch below is closed with the literal text "\None*" because
    # marker is still None; confirm whether such input can occur.
    if (attribs.get("closed") == "true"
            or obj_type in CLOSING_USX_TYPES
            or usfm_attributes):
        if obj_type == "ms":
            self.usfm_string += "\\*"
        else:
            self.usfm_string += f"\\{marker}*"

def _space_unless_after_whitespace(self):
    '''Append one space unless the output is empty or already ends in whitespace'''
    if self.usfm_string and not self.usfm_string.endswith(("\n", "\r", " ", "\t")):
        self.usfm_string += " "

if __name__ == "__main__":
    # Manual smoke run: convert one sample USX file and print the USFM
    from lxml import etree

    TEST_FILE = "../../../tests/basic/cross-refs/origin.xml"

    with open(TEST_FILE, 'r', encoding='utf-8') as sample_file:
        sample_root = etree.fromstring(sample_file.read())

    generator = USFMGenerator()
    generator.usx_to_usfm(sample_root)
    print(generator.usfm_string)
24 changes: 19 additions & 5 deletions py-usfm-parser/src/usfm_grammar/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,18 +77,32 @@ class Format(str, Enum):

class USFMParser():
"""Parser class with usfmstring, syntax_tree and methods for JSON convertions"""
def __init__(self, usfm_string:str=None, from_usj:dict=None):
def __init__(self, usfm_string:str=None, from_usj:dict=None, from_usx:etree.Element=None):
# super(USFMParser, self).__init__()
if usfm_string is not None and from_usj is not None:
raise Exception("Found USFM and USJ inputs! Only one supported in one object.")
inputs_given = 0
if usfm_string is not None:
inputs_given += 1
if from_usj is not None:
inputs_given += 1
if from_usx is not None:
inputs_given += 1

if inputs_given > 1:
raise Exception("Found more than one input!"+\
" Only one of USFM, USJ or USX is supported in one object.")
if inputs_given == 0:
raise Exception("Missing input! Either USFM, USJ or USX is to be provided.")

if usfm_string is not None:
self.usfm = usfm_string
elif from_usj is not None:
usj_converter = USFMGenerator()
usj_converter.usj_to_usfm(from_usj)
self.usfm = usj_converter.usfm_string
else:
raise Exception("Missing input! Either USFM or USJ to be provided.")
elif from_usx is not None:
usx_converter = USFMGenerator()
usx_converter.usx_to_usfm(from_usx)
self.usfm = usx_converter.usfm_string

self.usfm_bytes = None
self.syntax_tree = None
Expand Down
6 changes: 6 additions & 0 deletions py-usfm-parser/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ def generate_USFM_from_USJ(input_usj):
usj_parser = USFMParser(from_usj=input_usj)
return usj_parser.usfm

def generate_USFM_from_USX(input_usx):
    '''Build a parser from the USX input and return the USFM it generated'''
    return USFMParser(from_usx=input_usx).usfm


def parse_USFM_string(usfm_string):
'''Set up a parser obj with given string input'''
test_parser = USFMParser(usfm_string)
Expand Down
28 changes: 27 additions & 1 deletion py-usfm-parser/tests/test_usx_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from lxml.doctestcompare import LXMLOutputChecker, PARSE_XML

from tests import all_usfm_files, initialise_parser, doubtful_usfms,\
doubtful_usxs, negative_tests, find_all_markers
doubtful_usxs, negative_tests, find_all_markers,\
generate_USFM_from_USX, parse_USFM_string

lxml_object = etree.Element('Root')
checker = LXMLOutputChecker()
Expand Down Expand Up @@ -115,3 +116,28 @@ def test_all_markers_are_in_output(file_path):
# marker.endswith("-e") or marker.startswith("z")):
# marker = "milestone"
assert marker in all_styles or synonym in all_styles, marker

# USX files that test_usx_round_tripping leaves out
# (going by the name: their XML is known to fail parsing)
known_issue_of_failed_xml_parsing = [
    "../tests/usfmjsTests/inline_God/origin.xml",
    "../tests/paratextTests/GlossaryCitationFormContainsNonWordformingPunctuation/origin.xml",
]

@pytest.mark.parametrize('file_path', test_files)
@pytest.mark.timeout(30)
def test_usx_round_tripping(file_path):
    '''Convert USX to USFM and check the generated USFM parses without errors.

    The USX input is the file obtained by replacing ".usfm" with ".xml" in
    file_path. Files listed in known_issue_of_failed_xml_parsing and files
    containing status="invalid" are reported as skipped rather than passed.
    '''
    usx_path = file_path.replace(".usfm", ".xml")
    if usx_path in known_issue_of_failed_xml_parsing:
        pytest.skip("USX file is listed in known_issue_of_failed_xml_parsing")
    with open(usx_path, 'r', encoding='utf-8') as usx_file:
        usx_str = usx_file.read()
    if 'status="invalid"' in usx_str:
        pytest.skip('USX file contains status="invalid"')
    usx_xml = etree.fromstring(usx_str)

    generated_usfm = generate_USFM_from_USX(usx_xml)
    reparsed = parse_USFM_string(generated_usfm)
    # include the generated USFM in the failure message to ease debugging
    assert not reparsed.errors, str(reparsed.errors) + "\n" + generated_usfm

# assert test_parser2.to_usx() == usx_xml, generated_USX not same as input USX

0 comments on commit 49a3bd3

Please sign in to comment.