Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Usj to usfm #224

Merged
merged 9 commits into from
Oct 18, 2023
273 changes: 124 additions & 149 deletions docs/API guide for python usfm_grammar.ipynb

Large diffs are not rendered by default.

50 changes: 45 additions & 5 deletions py-usfm-parser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,44 @@ print(table_output)
```

To round trip with USJ
```
from usfm_grammar import USFMParser, Filter
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj()
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.usfm)
```
:warning: The generated USFM may differ from the original USFM in the following ways: 1. spaces and line breaks may change; 2. default attributes will be written with their names; 3. closing markers may be newly added.

To remove unwanted markers from USFM
```
from usfm_grammar import USFMParser, Filter, USFMGenerator
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj(include_markers=Filter.BCV+Filter.TEXT)
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.usfm)
```
USJ to USX or Table
```
from usfm_grammar import USFMParser, Filter
my_parser = USFMParser(input_usfm_str)
usj_obj = my_parser.to_usj()
my_parser2 = USFMParser(from_usj=usj_obj)
print(my_parser2.to_usx())
# print(my_parser2.to_list())
```
### From CLI

```
usage: usfm-grammar [-h] [--format {json,table,syntax-tree,usx,markdown}]
usage: usfm-grammar [-h] [--in_format {usfm,usj}]
[--out_format {usj,table,syntax-tree,usx,markdown,usfm}]
[--include_markers {book_headers,titles,...}]
[--exclude_markers {book_headers,titles,...}]
[--csv_col_sep CSV_COL_SEP] [--csv_row_sep CSV_ROW_SEP]
Expand All @@ -100,11 +134,13 @@ Uses the tree-sitter-usfm grammar to parse and convert USFM to Syntax-tree,
JSON, CSV, USX etc.
positional arguments:
infile input usfm file
infile input usfm or usj file
options:
-h, --help show this help message and exit
--format {json,table,syntax-tree,usx,markdown}
--in_format {usfm,usj}
input file format
--out_format {usj,table,syntax-tree,usx,markdown,usfm}
output format
--include_markers {book_headers,titles,comments,paragraphs,characters,notes,study_bible,bcv,text,ide,usfm,h,toc,toca,imt,is,ip,ipi,im,imi,ipq,imq,ipr,iq,ib,ili,iot,io,iex,imte,ie,mt,mte,cl,cd,ms,mr,s,sr,r,d,sp,sd,sts,rem,lit,restore,p,m,po,pr,cls,pmo,pm,pmc,pmr,pi,mi,nb,pc,ph,q,qr,qc,qa,qm,qd,lh,li,lf,lim,litl,tr,tc,th,tcr,thr,table,b,add,bk,dc,ior,iqt,k,litl,nd,ord,pn,png,qac,qs,qt,rq,sig,sls,tl,wj,em,bd,bdit,it,no,sc,sup,rb,pro,w,wh,wa,wg,lik,liv,jmp,f,fe,ef,efe,x,ex,fr,ft,fk,fq,fqa,fl,fw,fp,fv,fdc,xo,xop,xt,xta,xk,xq,xot,xnt,xdc,esb,cat,id,c,v,text-in-excluded-parent}
the list of contents to be included
Expand All @@ -123,11 +159,15 @@ options:
```
Example
```
>>> python3 -m usfm_grammar sample.usfm --format usx
>>> python3 -m usfm_grammar sample.usfm --out_format usx
>>> usfm-grammar sample.usfm --format usx
>>> usfm-grammar sample.usfm
>>> usfm-grammar sample.usfm --out_format usx
>>> usfm-grammar sample.usfm --include_markers bcv --include_markers text --include_markers s
>>> usfm-grammar sample-usj.json --out_format usfm
```

### Filtering on USJ
Expand Down
1 change: 1 addition & 0 deletions py-usfm-parser/src/usfm_grammar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
'''Entry point of the package with its public values'''

from usfm_grammar import usfm_parser
from usfm_grammar import usfm_generator

Filter = usfm_parser.Filter
Format = usfm_parser.Format
Expand Down
91 changes: 58 additions & 33 deletions py-usfm-parser/src/usfm_grammar/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,60 @@
for member in Filter:
all_markers += member.value

def handle_input_file(arg_parser):
    '''Read the input file named on the command line and build a parser from it.

    The file is treated as USJ (JSON) when ``--in_format`` equals ``Format.JSON``
    or when the file name has a ``.json`` or ``.usj`` extension. Otherwise it is
    treated as USFM when ``--in_format`` equals ``Format.USFM``.

    Args:
        arg_parser: the configured ``argparse.ArgumentParser``; its parsed
            ``infile`` and ``in_format`` values are used.

    Returns:
        A ``USFMParser`` initialised from the file content.

    Raises:
        Exception: if the input format is neither USJ nor USFM.
    '''
    import os  # pylint: disable=import-outside-toplevel

    # Parse the command line once and reuse the namespace.
    args = arg_parser.parse_args()
    infile = args.infile
    input_format = args.in_format
    with open(infile, 'r', encoding='utf-8') as input_file:
        file_content = input_file.read()

    # splitext only reports a real extension, so an extension-less file that
    # happens to be named "usj" or "json" is not mistaken for a USJ document.
    extension = os.path.splitext(infile)[1]
    if input_format == Format.JSON or extension in ('.json', '.usj'):
        return USFMParser(from_usj=json.loads(file_content))
    if input_format == Format.USFM:
        return USFMParser(file_content)
    raise Exception("Un-recognized input_format!")

def _expand_marker_options(markers):
    '''Expand one list of CLI marker options into a flat list of marker names.

    An item whose upper-cased form is the name of a ``Filter`` member is replaced
    by all the markers of that member. Any other item is lower-cased and has its
    backslashes removed. ``None`` (option not given) is passed through unchanged.
    '''
    if markers is None:
        return None
    filter_names = {member.name for member in Filter}
    expanded = []
    for itm in markers:
        if itm.upper() in filter_names:
            expanded += Filter[itm.upper()]
        else:
            expanded.append(itm.lower().replace("\\", ""))
    return expanded


def handle_include_exclude_options(arg_parser):
    '''Process the --exclude_markers and --include_markers options.

    Args:
        arg_parser: the configured ``argparse.ArgumentParser``; its parsed
            ``exclude_markers`` and ``include_markers`` values are used.

    Returns:
        A tuple ``(exclude_markers, include_markers)``. Each element is ``None``
        when the option was not given, otherwise a list of marker names with
        ``Filter`` member names expanded to their markers.
    '''
    # Parse the command line once and reuse the namespace for both options.
    args = arg_parser.parse_args()
    return (_expand_marker_options(args.exclude_markers),
            _expand_marker_options(args.include_markers))


def main(): #pylint: disable=too-many-locals
'''handles the command line requests'''
arg_parser = argparse.ArgumentParser(
description='Uses the tree-sitter-usfm grammar to parse and convert USFM to '+\
'Syntax-tree, JSON, CSV, USX etc.')
arg_parser.add_argument('infile', type=str, help='input usfm file')
arg_parser.add_argument('--format', type=str, help='output format',
arg_parser.add_argument('infile', type=str, help='input usfm or usj file')

arg_parser.add_argument('--in_format', type=str, help='input file format',
choices=[Format.USFM.value, Format.JSON.value],
default=Format.USFM.value)
arg_parser.add_argument('--out_format', type=str, help='output format',
choices=[itm.value for itm in Format],
default=Format.JSON.value)
arg_parser.add_argument('--include_markers', type=str,
Expand All @@ -44,52 +90,29 @@ def main(): #pylint: disable=too-many-locals
'from different components, or not',
action='store_true')

infile = arg_parser.parse_args().infile
output_format = arg_parser.parse_args().format
exclude_markers = arg_parser.parse_args().exclude_markers
include_markers = arg_parser.parse_args().include_markers

with open(infile, 'r', encoding='utf-8') as usfm_file:
file_content = usfm_file.read()

my_parser = USFMParser(file_content)
my_parser = handle_input_file(arg_parser)

if my_parser.errors and not arg_parser.parse_args().ignore_errors:
err_str = "\n\t".join([":".join(err) for err in my_parser.errors])
print(f"Errors present:\n\t{err_str}")
sys.exit(1)

filter_names = [member.name for member in Filter]
if exclude_markers is None:
updated_exclude_markers = None
else:
updated_exclude_markers = []
for itm in exclude_markers:
if itm.upper() in filter_names:
updated_exclude_markers += Filter[itm.upper()]
else:
updated_exclude_markers.append(itm.lower().replace("\\", ""))
if include_markers is None:
updated_include_markers = None
else:
updated_include_markers = []
for itm in include_markers:
if itm.upper() in filter_names:
updated_include_markers += Filter[itm.upper()]
else:
updated_include_markers.append(itm.lower().replace("\\", ""))
exclude_markers, include_markers = handle_include_exclude_options(arg_parser)

output_format = arg_parser.parse_args().out_format

match output_format:
case Format.JSON:
dict_output = my_parser.to_usj(
exclude_markers=updated_exclude_markers,
include_markers=updated_include_markers,
exclude_markers=exclude_markers,
include_markers=include_markers,
ignore_errors=True)
print(json.dumps(dict_output, indent=4, ensure_ascii=False))
case Format.CSV:
table_output = my_parser.to_list(
exclude_markers=updated_exclude_markers,
include_markers=updated_include_markers,
exclude_markers=exclude_markers,
include_markers=include_markers,
ignore_errors=True)
outfile = sys.stdout
writer = csv.writer(outfile,
Expand All @@ -104,6 +127,8 @@ def main(): #pylint: disable=too-many-locals
print(my_parser.to_markdown())
case Format.ST:
print(my_parser.to_syntax_tree(ignore_errors=True))
case Format.USFM:
print(my_parser.usfm)
case _:
raise Exception(f"Un-recognized output format:{output_format}!")

Expand Down
89 changes: 89 additions & 0 deletions py-usfm-parser/src/usfm_grammar/usfm_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
'''Convert other formats back into USFM'''

# USJ types that do not produce a USFM marker of their own.
NO_USFM_USJ_TYPES = ['USJ', 'table']
# USJ type prefixes after which no newline is appended.
NO_NEWLINE_USJ_TYPES = ['char', 'note', 'verse', 'table:cell']
# USJ type prefixes that get an explicit closing marker.
CLOSING_USJ_TYPES = ['char', 'note', 'figure']
# USJ keys that are handled explicitly and never written as "|key=value" attributes.
NON_ATTRIB_USJ_KEYS = ['type', 'content', 'number', 'sid',
                       'code', 'caller', 'align',
                       'version', 'altnumber', 'pubnumber', 'category']


class USFMGenerator:
    '''Combines the different methods that generate USFM from other formats in one class'''

    def __init__(self):
        # The generated USFM text; usj_to_usfm() appends to it.
        self.usfm_string = ''

    def is_valid_usfm(self, usfm_string: str = None) -> bool:
        '''Check the generated or passed USFM's correctness using the grammar.

        NOTE: no validation is implemented here; the method always returns False.
        '''
        if usfm_string is None:
            usfm_string = self.usfm_string
        return False

    def usj_to_usfm(self, usj_obj: dict, nested=False) -> None:  # pylint: disable=too-many-statements, too-many-branches
        '''Traverses through the dict/json and uses 'type' field to form USFM elements.

        The generated text is appended to ``self.usfm_string``; nothing is returned.

        Args:
            usj_obj: a USJ node. Its 'type' value is split on ':' and the last
                segment is used as the USFM marker name.
            nested: True when this node is a child of a 'char' node; a 'char'
                marker is then written with a '+' prefix.
        '''
        marker_types = usj_obj['type'].split(':')
        if usj_obj['type'] not in NO_USFM_USJ_TYPES:
            self.usfm_string += "\\"
            if nested and marker_types[0] == 'char':
                self.usfm_string += "+"
            self.usfm_string += f"{marker_types[-1]} "
        if 'code' in usj_obj:
            self.usfm_string += f"{usj_obj['code']} "
        if 'number' in usj_obj:
            self.usfm_string += usj_obj['number']
            if marker_types[0] == "verse":
                self.usfm_string += " "
        if 'caller' in usj_obj:
            self.usfm_string += f"{usj_obj['caller']} "
        if 'category' in usj_obj:
            self.usfm_string += f"\\cat {usj_obj['category']}\\cat*\n"
        if 'content' in usj_obj:
            # Children of a 'char' node are generated as nested markers.
            children_nested = marker_types[0] == 'char'
            for item in usj_obj['content']:
                if isinstance(item, str):
                    self.usfm_string += item
                elif children_nested:
                    self.usj_to_usfm(item, nested=True)
                else:
                    self.usj_to_usfm(item)
        # Every remaining key is written as an attribute after a single "|".
        attributes = False
        for key in usj_obj:
            if key not in NON_ATTRIB_USJ_KEYS:
                if not attributes:
                    self.usfm_string += "|"
                    attributes = True
                if key == "file":
                    # The USJ 'file' key is written as the 'src' attribute.
                    self.usfm_string += f"src=\"{usj_obj[key]}\" "
                else:
                    self.usfm_string += f"{key}=\"{usj_obj[key]}\" "

        if marker_types[0] in CLOSING_USJ_TYPES:
            self.usfm_string = self.usfm_string.strip() + "\\"
            if nested and marker_types[0] == 'char':
                self.usfm_string += "+"
            self.usfm_string += f"{marker_types[-1]}* "
        if marker_types[0] == "ms":
            if "sid" in usj_obj:
                if not attributes:
                    self.usfm_string += "|"
                    attributes = True
                self.usfm_string += f"sid=\"{usj_obj['sid']}\" "
            self.usfm_string = self.usfm_string.strip() + "\\*"
        if marker_types[0] == "sidebar":
            self.usfm_string += "\\esbe"
        # Finish the element with a newline unless one is already there. The
        # emptiness check avoids indexing an empty string when nothing has been
        # generated yet (e.g. a root 'USJ' object without content).
        if ":".join(marker_types[:-1]) not in NO_NEWLINE_USJ_TYPES and \
                self.usfm_string and not self.usfm_string.endswith("\n"):
            self.usfm_string += "\n"
        if "altnumber" in usj_obj:
            self.usfm_string += f"\\{marker_types[-1]}a {usj_obj['altnumber']}"
            self.usfm_string += f"\\{marker_types[-1]}a* "
        if "pubnumber" in usj_obj:
            self.usfm_string += f"\\{marker_types[-1]}p {usj_obj['pubnumber']}"
            if marker_types[-1] == "v":
                self.usfm_string += f"\\{marker_types[-1]}p* "
            else:
                self.usfm_string += "\n"

# def usx_to_usfm(self, usx_xml_tree) -> str: # should we call it just from_usx() instead
# '''Traverses xml tree and converts nodes to usfm elements
# based on type and style fields'''
# return self.usfm_string
20 changes: 16 additions & 4 deletions py-usfm-parser/src/usfm_grammar/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from usfm_grammar.usx_generator import USXGenerator
from usfm_grammar.usj_generator import USJGenerator
from usfm_grammar.list_generator import ListGenerator
from usfm_grammar.usfm_generator import USFMGenerator
from usfm_grammar.filters import exclude_markers_in_usj, include_markers_in_usj

class Filter(list, Enum):
Expand Down Expand Up @@ -41,12 +42,13 @@ class Filter(list, Enum):
# INNER_CONTENT = ['content-in-excluded-parent']

class Format(str, Enum):
'''Defines the valid values for output formats'''
JSON = "json"
'''Defines the valid values for input and output formats'''
JSON = "usj"
CSV = "table"
ST = "syntax-tree"
USX = "usx"
MD = "markdown"
USFM = "usfm"

lang_file = resources.path('usfm_grammar','my-languages.so')
USFM_LANGUAGE = Language(str(lang_file), 'usfm3')
Expand Down Expand Up @@ -75,9 +77,19 @@ class Format(str, Enum):

class USFMParser():
"""Parser class with usfmstring, syntax_tree and methods for JSON convertions"""
def __init__(self, usfm_string):
def __init__(self, usfm_string:str=None, from_usj:dict=None):
# super(USFMParser, self).__init__()
self.usfm = usfm_string
if usfm_string is not None and from_usj is not None:
raise Exception("Found USFM and USJ inputs! Only one supported in one object.")
if usfm_string is not None:
self.usfm = usfm_string
elif from_usj is not None:
usj_converter = USFMGenerator()
usj_converter.usj_to_usfm(from_usj)
self.usfm = usj_converter.usfm_string
else:
raise Exception("Missing input! Either USFM or USJ to be provided.")

self.usfm_bytes = None
self.syntax_tree = None
self.errors = None
Expand Down
10 changes: 10 additions & 0 deletions py-usfm-parser/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ def initialise_parser(input_usfm_path):
test_parser = USFMParser(usfm_string)
return test_parser

def generate_USFM_from_USJ(input_usj):
    '''Initialise a USFMParser from the given USJ object and return its USFM text'''
    return USFMParser(from_usj=input_usj).usfm

def parse_USFM_string(usfm_string):
    '''Return a USFMParser object created from the given USFM string'''
    return USFMParser(usfm_string)

def is_valid_usfm(input_usfm_path):
'''Checks the metadata.xml to see is the USFM is a valid one'''
if input_usfm_path in pass_fail_override_list:
Expand Down
Loading