Relaxed mode with "ignore_errors" option (#201)

* Exclude one more large file from tests as it times out * Add ignore_errors option on CLI * Bump version: 3.0.0-alpha.5 → 3.0.0-alpha.29 * add new GitHub Actions workflow to publish on test.pypi * try publish on test.pypi attempt #2 * Bump version: 3.0.0-alpha.29 → 3.0.0-alpha.6 * undo changes in workflow made for test.pypi publish * Python module: Add ignore_errors option in class methods * Python module: Add ignore_errors in calling class methods from CLI * JSON/dict format: remove backslash (\) from value of closing
Bridgeconn · Dec 15, 2022 · a08b2ae · a08b2ae
1 parent f8d3b49
commit a08b2ae
Show file tree

Hide file tree

Showing 10 changed files with 165 additions and 17 deletions.
diff --git a/.github/workflows/check-on-push.yml b/.github/workflows/check-on-push.yml
@@ -72,4 +72,4 @@ jobs:
       - name: Run tests for parsing errors
         working-directory: ./python-usfm-parser
         run:
-          pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc_grammar and not samples-from-wild" -n auto
+          pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc_grammar and not samples-from-wild and not 57-TIT.partial" -n auto
diff --git a/.github/workflows/pypi_publish.yml b/.github/workflows/pypi_publish.yml
@@ -57,7 +57,7 @@ jobs:
           name: artifact
           path: python-usfm-parser/dist/
 
-      - name: Publish distribution 📦 to Test PyPI
+      - name: Publish distribution 📦 to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__

diff --git a/.github/workflows/testpypi_publish.yml b/.github/workflows/testpypi_publish.yml
@@ -0,0 +1,73 @@
+# Builds platform wheels for python-usfm-parser and publishes them to Test PyPI.
+# Only the manual trigger (workflow_dispatch) is enabled.
+name: Test.PyPI Publish
+
+on:
+  # push: # need to use this temporarily to be able to publish
+  workflow_dispatch: # works only on default branch
+
+jobs:
+  build_wheels:
+    name: Build wheels on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-20.04, windows-2019, macos-11]
+        # os: [ubuntu-20.04]
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup node and npm
+        uses: actions/setup-node@v2
+        with:
+          node-version: 14
+      # The tree-sitter CLI is installed through npm, then run inside the grammar directory.
+      - name: Build the tree-sitter parser
+        run: |
+          cd tree-sitter-usfm3
+          npm install .
+          ./node_modules/.bin/tree-sitter generate
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.9.0
+        env:
+          # Installs the pinned tree-sitter binding and runs grammar_rebuild.py;
+          # presumably this compiles the grammar into my-languages.so (confirm in that script).
+          CIBW_BEFORE_BUILD: >
+              python -m pip install tree-sitter==0.20.0 &&
+              python python-usfm-parser/src/grammar_rebuild.py tree-sitter-usfm3 python-usfm-parser/src/usfm_grammar/my-languages.so
+        with:
+          package-dir: python-usfm-parser
+          output-dir: python-usfm-parser/dist
+          config-file: "python-usfm-parser/pyproject.toml"
+
+      # No artifact name is set here; the publish job below downloads "artifact",
+      # which should be this action's default name (confirm against the action docs).
+      - uses: actions/upload-artifact@v3
+        with:
+          path: python-usfm-parser/dist/*.whl
+
+  publish_to_pypi:
+    needs: [build_wheels]
+    runs-on: ubuntu-latest
+
+    steps:
+      # Pinned to the same release tag as the build job rather than a moving
+      # branch, so both jobs run the same checkout action version.
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v3
+        with:
+          python-version: "3.10"
+      - uses: actions/download-artifact@v3
+        with:
+          name: artifact
+          path: python-usfm-parser/dist/
+
+      - name: Publish distribution 📦 to Test PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PUBLISH_ON_TEST_PYPI }}
+          repository_url: https://test.pypi.org/legacy/
+          packages_dir: python-usfm-parser/dist/
diff --git a/python-usfm-parser/.bumpversion.cfg b/python-usfm-parser/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 3.0.0-alpha.5
+current_version = 3.0.0-alpha.6
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)\-(?P<release>\w+).(?P<num>\d+)

diff --git a/python-usfm-parser/pyproject.toml b/python-usfm-parser/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "usfm-grammar"
-version = "3.0.0-alpha.5"
+version = "3.0.0-alpha.6"
 description = "Python parser for USFM files, based on tree-sitter-usfm3"
 readme = "README.md"
 authors = [{ name = "BCS Team", email = "joel@bridgeconn.com" }]

diff --git a/python-usfm-parser/setup.py b/python-usfm-parser/setup.py
@@ -7,7 +7,7 @@ def has_ext_modules(self):
 
 setup(
     name="usfm-grammar",  # Required
-    version="3.0.0-alpha.5",  # Required
+    version="3.0.0-alpha.6",  # Required
     python_requires=">=3.10",
     install_requires=["tree-sitter", "lxml"],  # Optional
     package_data={  # Optional

diff --git a/python-usfm-parser/src/usfm_grammar/__init__.py b/python-usfm-parser/src/usfm_grammar/__init__.py
@@ -6,4 +6,4 @@
 Format = usfm_parser.Format
 USFMParser = usfm_parser.USFMParser
 
-__version__ = "3.0.0-alpha.5"
+__version__ = "3.0.0-alpha.6"
diff --git a/python-usfm-parser/src/usfm_grammar/__main__.py b/python-usfm-parser/src/usfm_grammar/__main__.py
@@ -26,6 +26,9 @@ def main():
     arg_parser.add_argument('--csv_row_sep', type=str,
                             help="row separator or delimiter. Only useful with format=table.",
                             default="\n")
+    arg_parser.add_argument('--ignore_errors',
+                            help="to get some output from successfully parsed portions",
+                            action='store_true')
 
     infile = arg_parser.parse_args().infile
     output_format = arg_parser.parse_args().format
@@ -36,7 +39,7 @@ def main():
 
     my_parser = USFMParser(file_content)
 
-    if my_parser.errors:
+    if my_parser.errors and not arg_parser.parse_args().ignore_errors:
         err_str = "\n\t".join([":".join(err) for err in my_parser.errors])
         print(f"Errors present:\n\t{err_str}")
         sys.exit(1)
@@ -50,23 +53,23 @@ def main():
 
     match output_format:
         case Format.JSON:
-            dict_output = my_parser.to_dict(filters=updated_filt)
+            dict_output = my_parser.to_dict(filters=updated_filt, ignore_errors=True)
             print(json.dumps(dict_output, indent=4, ensure_ascii=False))
         case Format.CSV:
-            table_output = my_parser.to_list(filters = updated_filt)
+            table_output = my_parser.to_list(filters = updated_filt, ignore_errors=True)
             outfile = sys.stdout
             writer = csv.writer(outfile,
                 delimiter=arg_parser.parse_args().csv_col_sep,
                 lineterminator=arg_parser.parse_args().csv_row_sep)
             writer.writerows(table_output)
         case Format.USX:
-            xmlstr = etree.tostring(my_parser.to_usx(),
+            xmlstr = etree.tostring(my_parser.to_usx(ignore_errors=True),
                 encoding='unicode', pretty_print=True)
             print(xmlstr)
         case Format.MD:
             print(my_parser.to_markdown())
         case Format.ST:
-            print(my_parser.to_syntax_tree())
+            print(my_parser.to_syntax_tree(ignore_errors=True))
         case _:
             raise Exception(f"Un-recognized output format:{output_format}!")
 

diff --git a/python-usfm-parser/src/usfm_grammar/usfm_parser.py b/python-usfm-parser/src/usfm_grammar/usfm_parser.py
@@ -522,7 +522,7 @@ def node_2_dict_generic(node, usfm_bytes, filters): # pylint: disable=R0912
         result['attributes'] = attribs
     if closing_node is not None:
         result['closing'] = usfm_bytes[\
-            closing_node.start_byte:closing_node.end_byte].decode('utf-8').strip()
+            closing_node.start_byte:closing_node.end_byte].decode('utf-8').strip().replace("\\","")
     return result
 
 @reduce_nesting
@@ -722,12 +722,22 @@ def __init__(self, usfm_string):
                                     for err in errors]
 
 
-    def to_syntax_tree(self):
+    def to_syntax_tree(self, ignore_errors=False):
         '''gives the syntax tree from class, as a string'''
+        if not ignore_errors and self.errors:
+            err_str = "\n\t".join([":".join(err) for err in self.errors])
+            raise Exception("Errors present:"+\
+                f'\n\t{err_str}'+\
+                "\nUse ignore_errors=True, to generate output inspite of errors")
         return self.syntax_tree.sexp()
 
-    def to_dict(self, filters=None): #pylint: disable=too-many-branches
+    def to_dict(self, filters=None, ignore_errors=False): #pylint: disable=too-many-branches
         '''Converts syntax tree to dictionary/json and selection of desired type of contents'''
+        if (not ignore_errors) and self.errors:
+            err_str = "\n\t".join([":".join(err) for err in self.errors])
+            raise Exception("Errors present:"+\
+                f'\n\t{err_str}'+\
+                "\nUse ignore_errors=True, to generate output inspite of errors")
         dict_output = {"book":{}}
         if filters is None or filters == []:
             filters = list(Filter)
@@ -775,14 +785,20 @@ def to_dict(self, filters=None): #pylint: disable=too-many-branches
             raise Exception(message)  from exe
         return dict_output
 
-    def to_list(self, filters=None): # pylint: disable=too-many-branches
+    def to_list(self, filters=None, ignore_errors=False): # pylint: disable=too-many-branches, too-many-locals
         '''uses the toJSON function and converts JSON to CSV
         To be re-implemented to work with the flat JSON schema'''
+        if not ignore_errors and self.errors:
+            err_str = "\n\t".join([":".join(err) for err in self.errors])
+            raise Exception("Errors present:"+\
+                f'\n\t{err_str}'+\
+                "\nUse ignore_errors=True, to generate output inspite of errors")
+
         if filters is None:
             filters = list(Filter)
         if Filter.PARAGRAPHS in filters:
             filters.remove(Filter.PARAGRAPHS)
-        scripture_json = self.to_dict(filters)
+        scripture_json = self.to_dict(filters, ignore_errors=ignore_errors)
         table_output = [["Book","Chapter","Verse","Verse-Text","Notes","Milestone","Other"]]
         book = scripture_json['book']['bookCode']
         verse_num = 0
@@ -831,8 +847,15 @@ def to_markdown(self):
         return "yet to be implemeneted"
 
 
-    def to_usx(self):
+    def to_usx(self, ignore_errors=False):
         '''convert the syntax_tree to the XML format USX'''
+        if not ignore_errors and self.errors:
+            err_str = "\n\t".join([":".join(err) for err in self.errors])
+            raise Exception("Errors present:"+\
+                f'\n\t{err_str}'+\
+                "\nUse ignore_errors=True, to generate output inspite of errors")
+
+
         usx_root = etree.Element("usx")
         usx_root.set("version", "3.0")
         try:

diff --git a/python-usfm-parser/tests/test_parsing.py b/python-usfm-parser/tests/test_parsing.py
@@ -1,6 +1,8 @@
 '''To test parsing success/errors for USFM/X committee's test suite'''
 import pytest
+from lxml import etree
 
+from src.usfm_grammar import USFMParser
 from tests import all_usfm_files, initialise_parser, is_valid_usfm,\
     doubtful_usfms, negative_tests, find_all_markers
 
@@ -47,3 +49,59 @@ def test_all_markers_are_in_output(file_path):
         if marker in ['qt', 'ts'] or marker.startswith("z"):
             marker = "milestone"
         assert marker in all_nodes_in_st, marker
+
+USFM_WITH_ERROR = '''
+\\id GEN
+\\c 1
+\\p
+\\v 1 correct verse one
+\\v 2 correct verse two
+\\p
+\\v3 wrong verse
+\\c 3
+\\v 1 verse in chapter without paragraph
+\\p
+\\v 2 a correct verse following one without para
+\\c 4
+\\s5
+\\p
+\\v 1 correct verse three after s5
+'''
+
+def test_partial_parsing_with_errors():
+    '''Test use of ignore_errors flag to obtain some output even when input has errors'''
+    test_parser = USFMParser(USFM_WITH_ERROR)
+    assert test_parser.errors
+
+    # without ignore_errors flag
+    def use_API_negative(test_parser, api_str_expression):
+        '''negative tests to ensure exception is raised'''
+        threw_error = False
+        try:
+            eval(api_str_expression)
+        except Exception as exe:
+            assert "Errors present:" in str(exe), api_str_expression
+            assert "Use ignore_errors=True" in str(exe)
+            threw_error = True
+        assert threw_error
+
+    use_API_negative(test_parser, 'test_parser.to_dict()')
+    use_API_negative(test_parser, 'test_parser.to_list()')
+    use_API_negative(test_parser, 'test_parser.to_usx()')
+
+    # with ignore_errors=True
+    def use_API_positive(test_parser, api_str_expression):
+        '''positive tests to ensure correct portions are made available in output'''
+        output = eval(api_str_expression)
+        if isinstance(output, etree._Element):
+            str_output = etree.tostring(output).decode('utf-8')
+        else:
+            str_output = str(output)
+        assert "correct verse one" in str_output, api_str_expression
+        assert "correct verse two" in str_output, api_str_expression
+        assert "correct verse three after s5" in str_output, api_str_expression
+
+    use_API_positive(test_parser, "test_parser.to_dict(ignore_errors=True)")
+    use_API_positive(test_parser, "test_parser.to_list(ignore_errors=True)")
+    use_API_positive(test_parser, "test_parser.to_usx(ignore_errors=True)")
+