diff --git a/.github/workflows/check-on-push.yml b/.github/workflows/check-on-push.yml index 03cd0e5d..84a65a9c 100644 --- a/.github/workflows/check-on-push.yml +++ b/.github/workflows/check-on-push.yml @@ -1,12 +1,12 @@ name: Test-on-push # Run this workflow every time a new commit pushed to your repository -on: - push: - pull_request: - branches: - - master - - version-3 +on: [push, pull_request ] + # push: + # pull_request: + # branches: + # - master + # - version-3 jobs: # Set the job key. The key is displayed as the job name @@ -73,3 +73,58 @@ jobs: run: # pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc_grammar and not samples-from-wild and not 57-TIT.partial" -n auto python -m pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc_grammar and not generated_usx_with_rnc_grammar and not samples-from-wild" -n auto + + Run-Node-tests: + name: Run Node tests + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup node and npm + uses: actions/setup-node@v4 + with: + node-version: 20 + - name: Run tests + run: | + cd tree-sitter-usfm3 + npm install --save nan + npm install --save-dev tree-sitter-cli + ./node_modules/.bin/tree-sitter generate + - name: Install dependencies + run: | + cd node-usfm-parser + npm install . + npm install ../tree-sitter-usfm3 + node_modules/mocha/bin/mocha.js --timeout=40000 --grep "Include|Exclude|wild|Compare" --invert + + Run-Web-tests: + name: Run Web tests + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup node and npm + uses: actions/setup-node@v4 + with: + node-version: 20 + - name: Run tests + run: | + cd tree-sitter-usfm3 + npm install --save nan + npm install --save-dev tree-sitter-cli + ./node_modules/.bin/tree-sitter generate + ./node_modules/.bin/tree-sitter build --wasm + + - name: Install dependencies + run: | + cd web-usfm-parser + npm install . 
+ cp node_modules/web-tree-sitter/tree-sitter.js src/web-tree-sitter/ + cp node_modules/web-tree-sitter/tree-sitter.wasm ./ + cp ../tree-sitter-usfm3/tree-sitter-usfm3.wasm ./tree-sitter-usfm.wasm + + node_modules/mocha/bin/mocha.js --timeout=40000 --grep "Include|Exclude|wild|Compare" --invert + + + + diff --git a/.gitignore b/.gitignore index d42a9d19..4154bcda 100644 --- a/.gitignore +++ b/.gitignore @@ -81,4 +81,6 @@ tree-sitter-usfm3/src/ */dist/* */src/usfm_grammar.egg-info/* **/my-languages.so -**/wheelhouse/ \ No newline at end of file +**/wheelhouse/ +js-usfm-parser/src/web-tree-sitter/tree-sitter.js +js-usfm-parser/src/web-tree-sitter/tree-sitter.wasm diff --git a/docs/Dev_notes.md b/docs/Dev_notes.md index 6356103b..ac350092 100644 --- a/docs/Dev_notes.md +++ b/docs/Dev_notes.md @@ -3,14 +3,14 @@ ## How to build the binary for python module? First compile the grammar -``` +```bash cd tree-sitter-usfm3 export PATH=$PATH:./node_modules/.bin tree-sitter generate tree-sitter test ``` To use the grammar module still in developement from within the py-usfm-grammar module -``` +```bash cd py-usfm-parser source ENV-dev/bin/actiavte pip install ../tree-sitter-usfm3 @@ -21,7 +21,7 @@ To make the changes reflect automatically `pip install -e ../tree-sitter-usfm3`. ## How to change version number in files? 
In python module, -``` +```bash cd usfm-grammar source py-usfm-parser/ENV-dev/bin/activate bumpversion --new-version 3.0.0-alpha.28 num @@ -31,7 +31,7 @@ The github action is configured to automatically build and publish to PyPI and N ## Run tests To check Syntax trees in Grammar module -``` +```bash cd tree-sitter-usfm3 export PATH=$PATH:./node_modules/.bin tree-sitter generate @@ -40,7 +40,7 @@ tree-sitter test In python module alone -``` +```bash cd py-usfm-parser python -m pytest -n auto @@ -49,3 +49,43 @@ pytest -k "not compare_usx_with_testsuite_samples and not testsuite_usx_with_rnc ``` +## How to build and publish JS web module for local Development + +First compile the grammar and get the wasm file: +```bash +cd tree-sitter-usfm3 +export PATH=$PATH:./node_modules/.bin +tree-sitter generate +tree-sitter build --wasm +cp tree-sitter-usfm3.wasm ../web-usfm-parser/tree-sitter-usfm.wasm +cd .. +``` +After `npm install`, copy the `tree-sitter.js` file from `node_modules/web-tree-sitter` to the `web-usfm-parser/src/web-tree-sitter` folder so that it is included in the bundle. Also copy the `tree-sitter.wasm` file to `web-usfm-parser/` so that it is included in the npm package. + +```bash +cd web-usfm-parser/ +npm install . +cp node_modules/web-tree-sitter/tree-sitter.js src/web-tree-sitter/ +cp node_modules/web-tree-sitter/tree-sitter.wasm ./ + +``` + +### To publish the node and web modules + +Build the code base, generating both CJS and ESM versions of the same code. This uses Parcel, and its configuration lives in package.json (main, module, source, etc.). These steps can be followed in both the node module directory and the web module directory. + +```bash +rm -fr ./dist +npm run build +``` + +Use a local publishing registry for local development and testing: + +```bash +npm install -g verdaccio # need not do again if done once +verdaccio # runs a server at localhost:4873 +touch .npmrc +echo "registry=http://localhost:4873" > .npmrc # OR http://0.0.0.0:4873 +npm publish . 
+``` + diff --git a/docs/react-usage.md b/docs/react-usage.md new file mode 100644 index 00000000..827b7e20 --- /dev/null +++ b/docs/react-usage.md @@ -0,0 +1,36 @@ +## How to use the usfm-grammar npm package from React + +Using the library from a React app requires a few extra steps. +1. The modules `fs`, `path` and `process` used by tree-sitter are not required in the front end, but the bundler may fail while trying to resolve them. Hence use `react-app-rewired` and add the following settings to the `config-overrides.js` file: + + ```javascript + const { override } = require('customize-cra'); + + module.exports = override( + config => { + config.resolve.fallback = { + fs: false, + path: false, + process: false + }; + return config; + }, + ); + ``` +2. When initializing the `USFMParser` class, pass the URLs of the required wasm files as shown below: +```javascript +import React, { useEffect } from 'react'; +import { USFMParser } from 'usfm-grammar'; + +function App() { + ... + useEffect(() => { + const initParser = async () => { + await USFMParser.init("https://cdn.jsdelivr.net/npm/usfm-grammar@3.0.0-alpha.6/tree-sitter-usfm.wasm", + "https://cdn.jsdelivr.net/npm/usfm-grammar@3.0.0-alpha.6/tree-sitter.wasm"); + }; + initParser(); + }, []); + ... +} +``` \ No newline at end of file diff --git a/js-usfm-parser/README.md b/js-usfm-parser/README.md deleted file mode 100644 index 7ae01f04..00000000 --- a/js-usfm-parser/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# USFM-Grammar - -The Javascript module, that uses the grammar implementation of [USFM](https://ubsicap.github.io/usfm/) language, via the [tree-sitter-USFM3](https://www.npmjs.com/package/tree-sitter-usfm3) package, to convert the USFM inputs to other formats like JSON, table, [USX](https://ubsicap.github.io/usx/) etc. 
- -## Installation - -`npm install usfm-grammar` - -## Usage - -### Command-line-interface(CLI) - -`usfm-grammar /path/to/file.usfm` - -### Javascript APIs - - -## Development instructions - -Make sure a good undertsanding and familarity with the USFM format is achieved. - -1. Setup the code base - -- Fork Base repo to your personal github account: `https://github.com/Bridgeconn/usfm-grammar` -- Clone specific branch locally: `git clone --branch version-3 https://github.com//usfm-grammar` -- Set remotes : `git remote set upstream https://github.com/Bridgeconn/usfm-grammar` - -2. Install - -Esnure stable version of node by - -`nvm install --lts` - -`cd usfm-grammar/js-usfm-parser` - -`npm install .` - -3. Test while developing - -For the time being use the bottom portion of the script `usfm-grammar/js-usfm-parser/usfm_parser.js` to change the input usfm string, try different class methods and print the result to console. - -`node usfm_parser.js` - -4. How to implement? - -This JS module is supposed to copy all the fucntionalities available in the [python module](../python-usfm-parser). Refer the python script `python-usfm-parser/src/usfm_grammar/usfm_parser.py` and rewrite the functionalities in it to JS in the `usfm-grammar/js-usfm-parser/usfm_parser.js` file.s - -For knowing the available APIs and syntaxes of tree-sitter(`node-tree-sitter`) library, refer the [tests in their git repo](https://github.com/tree-sitter/node-tree-sitter/tree/master/test). They will have corresponding implementations for the fuctionalities we have used from their python library(`py-tree-sitter`). - -5. 
Contribute back - -The following development practices are recommended to be able to contribute back to the main code base -- update from upstream `git pull --rebase upstream version3` -- commit local changes and push to your github repo(` git push origin `) -- send PR from your repo to `version-3` branch of base repo diff --git a/js-usfm-parser/package-lock.json b/js-usfm-parser/package-lock.json deleted file mode 100644 index c53b0870..00000000 --- a/js-usfm-parser/package-lock.json +++ /dev/null @@ -1,1010 +0,0 @@ -{ - "name": "usfm-grammar", - "version": "3.0.0-alpha.1", - "lockfileVersion": 2, - "requires": true, - "packages": { - "": { - "name": "usfm-grammar", - "version": "3.0.0-alpha.1", - "license": "MIT", - "dependencies": { - "tree-sitter": "^0.20.0", - "tree-sitter-usfm3": "^3.0.0-alpha.2" - } - }, - "node_modules/ansi-regex": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", - "integrity": "sha512-TIGnTpdo+E3+pCyAluZvtED5p5wCqLdezCyhPZzKPcxvFplEt4i+W7OONCKgeZFT3+y5NZZfOOS/Bdcanm1MYA==", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/aproba": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz", - "integrity": "sha512-Y9J6ZjXtoYh8RnXVCMOU/ttDmk1aBjunq9vO0ta5x85WDQiQfUF9sIPBITdbiiIVcBo03Hi3jMxigBtsddlXRw==" - }, - "node_modules/are-we-there-yet": { - "version": "1.1.7", - "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.1.7.tgz", - "integrity": "sha512-nxwy40TuMiUGqMyRHgCSWZ9FM4VAoRP4xUYSTv5ImRog+h9yISPbVH7H8fASCIzYn9wlEv4zvFL7uKDMCFQm3g==", - "dependencies": { - "delegates": "^1.0.0", - "readable-stream": "^2.0.6" - } - }, - "node_modules/base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", - "funding": [ - { - "type": "github", - "url": 
"https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/bl": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", - "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", - "dependencies": { - "buffer": "^5.5.0", - "inherits": "^2.0.4", - "readable-stream": "^3.4.0" - } - }, - "node_modules/bl/node_modules/readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "dependencies": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/buffer": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", - "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "dependencies": { - "base64-js": "^1.3.1", - "ieee754": "^1.1.13" - } - }, - "node_modules/chownr": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", - "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" - }, - "node_modules/code-point-at": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", - "integrity": "sha512-RpAVKQA5T63xEj6/giIbUEtZwJ4UFIc3ZtvEkiaUERylqe8xb5IvqcgOurZLahv93CLKfxcw5YI+DZcUBRyLXA==", - "engines": { - 
"node": ">=0.10.0" - } - }, - "node_modules/console-control-strings": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/console-control-strings/-/console-control-strings-1.1.0.tgz", - "integrity": "sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==" - }, - "node_modules/core-util-is": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", - "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==" - }, - "node_modules/decompress-response": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-4.2.1.tgz", - "integrity": "sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==", - "dependencies": { - "mimic-response": "^2.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/deep-extend": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", - "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==", - "engines": { - "node": ">=4.0.0" - } - }, - "node_modules/delegates": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delegates/-/delegates-1.0.0.tgz", - "integrity": "sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==" - }, - "node_modules/detect-libc": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-1.0.3.tgz", - "integrity": "sha512-pGjwhsmsp4kL2RTz08wcOlGN83otlqHeD/Z5T8GXZB+/YcpQ/dgo+lbU8ZsGxV0HIvqqxo9l7mqYwyYMD9bKDg==", - "bin": { - "detect-libc": "bin/detect-libc.js" - }, - "engines": { - "node": ">=0.10" - } - }, - "node_modules/end-of-stream": { - "version": "1.4.4", - "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", - "integrity": 
"sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", - "dependencies": { - "once": "^1.4.0" - } - }, - "node_modules/expand-template": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", - "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==", - "engines": { - "node": ">=6" - } - }, - "node_modules/fs-constants": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", - "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==" - }, - "node_modules/gauge": { - "version": "2.7.4", - "resolved": "https://registry.npmjs.org/gauge/-/gauge-2.7.4.tgz", - "integrity": "sha512-14x4kjc6lkD3ltw589k0NrPD6cCNTD6CWoVUNpB85+DrtONoZn+Rug6xZU5RvSC4+TZPxA5AnBibQYAvZn41Hg==", - "dependencies": { - "aproba": "^1.0.3", - "console-control-strings": "^1.0.0", - "has-unicode": "^2.0.0", - "object-assign": "^4.1.0", - "signal-exit": "^3.0.0", - "string-width": "^1.0.1", - "strip-ansi": "^3.0.1", - "wide-align": "^1.1.0" - } - }, - "node_modules/github-from-package": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", - "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==" - }, - "node_modules/has-unicode": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/has-unicode/-/has-unicode-2.0.1.tgz", - "integrity": "sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==" - }, - "node_modules/ieee754": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", - "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", - "funding": [ - { - "type": "github", - "url": 
"https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" - }, - "node_modules/ini": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", - "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" - }, - "node_modules/is-fullwidth-code-point": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", - "integrity": "sha512-1pqUqRjkhPJ9miNq9SwMfdvi6lBJcd6eFxvfaivQhaH3SgisfiuudvFntdKOmxuee/77l+FPjKrQjWvmPjWrRw==", - "dependencies": { - "number-is-nan": "^1.0.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/isarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" - }, - "node_modules/mimic-response": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-2.1.0.tgz", - "integrity": "sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/minimist": { - "version": "1.2.6", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.6.tgz", - "integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q==" - }, - "node_modules/mkdirp-classic": { - "version": "0.5.3", - "resolved": 
"https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", - "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" - }, - "node_modules/nan": { - "version": "2.16.0", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.16.0.tgz", - "integrity": "sha512-UdAqHyFngu7TfQKsCBgAA6pWDkT8MAO7d0jyOecVhN5354xbLqdn8mV9Tat9gepAupm0bt2DbeaSC8vS52MuFA==" - }, - "node_modules/napi-build-utils": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz", - "integrity": "sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==" - }, - "node_modules/node-abi": { - "version": "2.30.1", - "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-2.30.1.tgz", - "integrity": "sha512-/2D0wOQPgaUWzVSVgRMx+trKJRC2UG4SUc4oCJoXx9Uxjtp0Vy3/kt7zcbxHF8+Z/pK3UloLWzBISg72brfy1w==", - "dependencies": { - "semver": "^5.4.1" - } - }, - "node_modules/npmlog": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-4.1.2.tgz", - "integrity": "sha512-2uUqazuKlTaSI/dC8AzicUck7+IrEaOnN/e0jd3Xtt1KcGpwx30v50mL7oPyr/h9bL3E4aZccVwpwP+5W9Vjkg==", - "dependencies": { - "are-we-there-yet": "~1.1.2", - "console-control-strings": "~1.1.0", - "gauge": "~2.7.3", - "set-blocking": "~2.0.0" - } - }, - "node_modules/number-is-nan": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", - "integrity": "sha512-4jbtZXNAsfZbAHiiqjLPBiCl16dES1zI4Hpzzxw61Tk+loF+sBDBKx1ICKKKwIqQ7M0mFn1TmkN7euSncWgHiQ==", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/once": { - "version": 
"1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "dependencies": { - "wrappy": "1" - } - }, - "node_modules/prebuild-install": { - "version": "6.1.4", - "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-6.1.4.tgz", - "integrity": "sha512-Z4vpywnK1lBg+zdPCVCsKq0xO66eEV9rWo2zrROGGiRS4JtueBOdlB1FnY8lcy7JsUud/Q3ijUxyWN26Ika0vQ==", - "dependencies": { - "detect-libc": "^1.0.3", - "expand-template": "^2.0.3", - "github-from-package": "0.0.0", - "minimist": "^1.2.3", - "mkdirp-classic": "^0.5.3", - "napi-build-utils": "^1.0.1", - "node-abi": "^2.21.0", - "npmlog": "^4.0.1", - "pump": "^3.0.0", - "rc": "^1.2.7", - "simple-get": "^3.0.3", - "tar-fs": "^2.0.0", - "tunnel-agent": "^0.6.0" - }, - "bin": { - "prebuild-install": "bin.js" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/process-nextick-args": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", - "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==" - }, - "node_modules/pump": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", - "dependencies": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, - "node_modules/rc": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", - "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", - "dependencies": { - "deep-extend": "^0.6.0", - "ini": "~1.3.0", - "minimist": "^1.2.0", - "strip-json-comments": "~2.0.1" - }, - "bin": { - "rc": "cli.js" - } - }, - "node_modules/readable-stream": { - "version": "2.3.7", - "resolved": 
"https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", - "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==", - "dependencies": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "node_modules/safe-buffer": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", - "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" - }, - "node_modules/semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", - "bin": { - "semver": "bin/semver" - } - }, - "node_modules/set-blocking": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", - "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==" - }, - "node_modules/signal-exit": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", - "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" - }, - "node_modules/simple-concat": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", - "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, - "node_modules/simple-get": { - "version": "3.1.1", 
- "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-3.1.1.tgz", - "integrity": "sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==", - "dependencies": { - "decompress-response": "^4.2.0", - "once": "^1.3.1", - "simple-concat": "^1.0.0" - } - }, - "node_modules/string_decoder": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", - "dependencies": { - "safe-buffer": "~5.1.0" - } - }, - "node_modules/string-width": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", - "integrity": "sha512-0XsVpQLnVCXHJfyEs8tC0zpTVIr5PKKsQtkT29IwupnPTjtPmQ3xT/4yCREF9hYkV/3M3kzcUTSAZT6a6h81tw==", - "dependencies": { - "code-point-at": "^1.0.0", - "is-fullwidth-code-point": "^1.0.0", - "strip-ansi": "^3.0.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/strip-ansi": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", - "integrity": "sha512-VhumSSbBqDTP8p2ZLKj40UjBCV4+v8bUSEpUb4KjRgWk9pbqGF4REFj6KEagidb2f/M6AzC0EmFyDNGaw9OCzg==", - "dependencies": { - "ansi-regex": "^2.0.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/strip-json-comments": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", - "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/tar-fs": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz", - "integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==", - "dependencies": { - "chownr": "^1.1.1", - "mkdirp-classic": "^0.5.2", - 
"pump": "^3.0.0", - "tar-stream": "^2.1.4" - } - }, - "node_modules/tar-stream": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", - "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", - "dependencies": { - "bl": "^4.0.3", - "end-of-stream": "^1.4.1", - "fs-constants": "^1.0.0", - "inherits": "^2.0.3", - "readable-stream": "^3.1.1" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/tar-stream/node_modules/readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "dependencies": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/tree-sitter": { - "version": "0.20.0", - "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.20.0.tgz", - "integrity": "sha512-tqTdtD1T2cN4aEES0sZCjKTQrc9Ls8H/iYlzpskhGy8yCwNPKBIbK9YuuCg/AxACr8RAY4wMoeCigM1X/A79yg==", - "hasInstallScript": true, - "dependencies": { - "nan": "^2.14.0", - "prebuild-install": "^6.0.1" - } - }, - "node_modules/tree-sitter-usfm3": { - "version": "3.0.0-alpha.2", - "resolved": "https://registry.npmjs.org/tree-sitter-usfm3/-/tree-sitter-usfm3-3.0.0-alpha.2.tgz", - "integrity": "sha512-seLLMTlY9y3TFKcIz/idX0kXMEVAT0SSjBpHHiwO8zcCENHFcm0C4d5wwPfnzePGNFPi23PI03+LxvimQv+6xQ==", - "hasInstallScript": true, - "dependencies": { - "nan": "^2.15.0" - } - }, - "node_modules/tunnel-agent": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", - "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", - "dependencies": { - "safe-buffer": "^5.0.1" - }, - "engines": { - "node": "*" - } - }, - 
"node_modules/util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" - }, - "node_modules/wide-align": { - "version": "1.1.5", - "resolved": "https://registry.npmjs.org/wide-align/-/wide-align-1.1.5.tgz", - "integrity": "sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==", - "dependencies": { - "string-width": "^1.0.2 || 2 || 3 || 4" - } - }, - "node_modules/wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" - } - }, - "dependencies": { - "ansi-regex": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", - "integrity": "sha512-TIGnTpdo+E3+pCyAluZvtED5p5wCqLdezCyhPZzKPcxvFplEt4i+W7OONCKgeZFT3+y5NZZfOOS/Bdcanm1MYA==" - }, - "aproba": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz", - "integrity": "sha512-Y9J6ZjXtoYh8RnXVCMOU/ttDmk1aBjunq9vO0ta5x85WDQiQfUF9sIPBITdbiiIVcBo03Hi3jMxigBtsddlXRw==" - }, - "are-we-there-yet": { - "version": "1.1.7", - "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.1.7.tgz", - "integrity": "sha512-nxwy40TuMiUGqMyRHgCSWZ9FM4VAoRP4xUYSTv5ImRog+h9yISPbVH7H8fASCIzYn9wlEv4zvFL7uKDMCFQm3g==", - "requires": { - "delegates": "^1.0.0", - "readable-stream": "^2.0.6" - } - }, - "base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==" - }, - "bl": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", - "integrity": 
"sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", - "requires": { - "buffer": "^5.5.0", - "inherits": "^2.0.4", - "readable-stream": "^3.4.0" - }, - "dependencies": { - "readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "requires": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - } - } - } - }, - "buffer": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", - "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", - "requires": { - "base64-js": "^1.3.1", - "ieee754": "^1.1.13" - } - }, - "chownr": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", - "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" - }, - "code-point-at": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", - "integrity": "sha512-RpAVKQA5T63xEj6/giIbUEtZwJ4UFIc3ZtvEkiaUERylqe8xb5IvqcgOurZLahv93CLKfxcw5YI+DZcUBRyLXA==" - }, - "console-control-strings": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/console-control-strings/-/console-control-strings-1.1.0.tgz", - "integrity": "sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==" - }, - "core-util-is": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", - "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==" - }, - "decompress-response": { - "version": "4.2.1", - "resolved": 
"https://registry.npmjs.org/decompress-response/-/decompress-response-4.2.1.tgz", - "integrity": "sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==", - "requires": { - "mimic-response": "^2.0.0" - } - }, - "deep-extend": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", - "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==" - }, - "delegates": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delegates/-/delegates-1.0.0.tgz", - "integrity": "sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==" - }, - "detect-libc": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-1.0.3.tgz", - "integrity": "sha512-pGjwhsmsp4kL2RTz08wcOlGN83otlqHeD/Z5T8GXZB+/YcpQ/dgo+lbU8ZsGxV0HIvqqxo9l7mqYwyYMD9bKDg==" - }, - "end-of-stream": { - "version": "1.4.4", - "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", - "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", - "requires": { - "once": "^1.4.0" - } - }, - "expand-template": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", - "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==" - }, - "fs-constants": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", - "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==" - }, - "gauge": { - "version": "2.7.4", - "resolved": "https://registry.npmjs.org/gauge/-/gauge-2.7.4.tgz", - "integrity": "sha512-14x4kjc6lkD3ltw589k0NrPD6cCNTD6CWoVUNpB85+DrtONoZn+Rug6xZU5RvSC4+TZPxA5AnBibQYAvZn41Hg==", - "requires": { - "aproba": "^1.0.3", - 
"console-control-strings": "^1.0.0", - "has-unicode": "^2.0.0", - "object-assign": "^4.1.0", - "signal-exit": "^3.0.0", - "string-width": "^1.0.1", - "strip-ansi": "^3.0.1", - "wide-align": "^1.1.0" - } - }, - "github-from-package": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", - "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==" - }, - "has-unicode": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/has-unicode/-/has-unicode-2.0.1.tgz", - "integrity": "sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==" - }, - "ieee754": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", - "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==" - }, - "inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" - }, - "ini": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", - "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" - }, - "is-fullwidth-code-point": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", - "integrity": "sha512-1pqUqRjkhPJ9miNq9SwMfdvi6lBJcd6eFxvfaivQhaH3SgisfiuudvFntdKOmxuee/77l+FPjKrQjWvmPjWrRw==", - "requires": { - "number-is-nan": "^1.0.0" - } - }, - "isarray": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" - }, - "mimic-response": { - "version": "2.1.0", - "resolved": 
"https://registry.npmjs.org/mimic-response/-/mimic-response-2.1.0.tgz", - "integrity": "sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==" - }, - "minimist": { - "version": "1.2.6", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.6.tgz", - "integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q==" - }, - "mkdirp-classic": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", - "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" - }, - "nan": { - "version": "2.16.0", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.16.0.tgz", - "integrity": "sha512-UdAqHyFngu7TfQKsCBgAA6pWDkT8MAO7d0jyOecVhN5354xbLqdn8mV9Tat9gepAupm0bt2DbeaSC8vS52MuFA==" - }, - "napi-build-utils": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz", - "integrity": "sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==" - }, - "node-abi": { - "version": "2.30.1", - "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-2.30.1.tgz", - "integrity": "sha512-/2D0wOQPgaUWzVSVgRMx+trKJRC2UG4SUc4oCJoXx9Uxjtp0Vy3/kt7zcbxHF8+Z/pK3UloLWzBISg72brfy1w==", - "requires": { - "semver": "^5.4.1" - } - }, - "npmlog": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-4.1.2.tgz", - "integrity": "sha512-2uUqazuKlTaSI/dC8AzicUck7+IrEaOnN/e0jd3Xtt1KcGpwx30v50mL7oPyr/h9bL3E4aZccVwpwP+5W9Vjkg==", - "requires": { - "are-we-there-yet": "~1.1.2", - "console-control-strings": "~1.1.0", - "gauge": "~2.7.3", - "set-blocking": "~2.0.0" - } - }, - "number-is-nan": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", - "integrity": 
"sha512-4jbtZXNAsfZbAHiiqjLPBiCl16dES1zI4Hpzzxw61Tk+loF+sBDBKx1ICKKKwIqQ7M0mFn1TmkN7euSncWgHiQ==" - }, - "object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==" - }, - "once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "requires": { - "wrappy": "1" - } - }, - "prebuild-install": { - "version": "6.1.4", - "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-6.1.4.tgz", - "integrity": "sha512-Z4vpywnK1lBg+zdPCVCsKq0xO66eEV9rWo2zrROGGiRS4JtueBOdlB1FnY8lcy7JsUud/Q3ijUxyWN26Ika0vQ==", - "requires": { - "detect-libc": "^1.0.3", - "expand-template": "^2.0.3", - "github-from-package": "0.0.0", - "minimist": "^1.2.3", - "mkdirp-classic": "^0.5.3", - "napi-build-utils": "^1.0.1", - "node-abi": "^2.21.0", - "npmlog": "^4.0.1", - "pump": "^3.0.0", - "rc": "^1.2.7", - "simple-get": "^3.0.3", - "tar-fs": "^2.0.0", - "tunnel-agent": "^0.6.0" - } - }, - "process-nextick-args": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", - "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==" - }, - "pump": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, - "rc": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", - "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", - "requires": { - 
"deep-extend": "^0.6.0", - "ini": "~1.3.0", - "minimist": "^1.2.0", - "strip-json-comments": "~2.0.1" - } - }, - "readable-stream": { - "version": "2.3.7", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", - "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==", - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "safe-buffer": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", - "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" - }, - "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==" - }, - "set-blocking": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", - "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==" - }, - "signal-exit": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", - "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" - }, - "simple-concat": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", - "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==" - }, - "simple-get": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-3.1.1.tgz", - "integrity": 
"sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==", - "requires": { - "decompress-response": "^4.2.0", - "once": "^1.3.1", - "simple-concat": "^1.0.0" - } - }, - "string_decoder": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", - "requires": { - "safe-buffer": "~5.1.0" - } - }, - "string-width": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", - "integrity": "sha512-0XsVpQLnVCXHJfyEs8tC0zpTVIr5PKKsQtkT29IwupnPTjtPmQ3xT/4yCREF9hYkV/3M3kzcUTSAZT6a6h81tw==", - "requires": { - "code-point-at": "^1.0.0", - "is-fullwidth-code-point": "^1.0.0", - "strip-ansi": "^3.0.0" - } - }, - "strip-ansi": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", - "integrity": "sha512-VhumSSbBqDTP8p2ZLKj40UjBCV4+v8bUSEpUb4KjRgWk9pbqGF4REFj6KEagidb2f/M6AzC0EmFyDNGaw9OCzg==", - "requires": { - "ansi-regex": "^2.0.0" - } - }, - "strip-json-comments": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", - "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==" - }, - "tar-fs": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz", - "integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==", - "requires": { - "chownr": "^1.1.1", - "mkdirp-classic": "^0.5.2", - "pump": "^3.0.0", - "tar-stream": "^2.1.4" - } - }, - "tar-stream": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", - "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", - "requires": { - 
"bl": "^4.0.3", - "end-of-stream": "^1.4.1", - "fs-constants": "^1.0.0", - "inherits": "^2.0.3", - "readable-stream": "^3.1.1" - }, - "dependencies": { - "readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "requires": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - } - } - } - }, - "tree-sitter": { - "version": "0.20.0", - "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.20.0.tgz", - "integrity": "sha512-tqTdtD1T2cN4aEES0sZCjKTQrc9Ls8H/iYlzpskhGy8yCwNPKBIbK9YuuCg/AxACr8RAY4wMoeCigM1X/A79yg==", - "requires": { - "nan": "^2.14.0", - "prebuild-install": "^6.0.1" - } - }, - "tree-sitter-usfm3": { - "version": "3.0.0-alpha.2", - "resolved": "https://registry.npmjs.org/tree-sitter-usfm3/-/tree-sitter-usfm3-3.0.0-alpha.2.tgz", - "integrity": "sha512-seLLMTlY9y3TFKcIz/idX0kXMEVAT0SSjBpHHiwO8zcCENHFcm0C4d5wwPfnzePGNFPi23PI03+LxvimQv+6xQ==", - "requires": { - "nan": "^2.15.0" - } - }, - "tunnel-agent": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", - "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", - "requires": { - "safe-buffer": "^5.0.1" - } - }, - "util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" - }, - "wide-align": { - "version": "1.1.5", - "resolved": "https://registry.npmjs.org/wide-align/-/wide-align-1.1.5.tgz", - "integrity": "sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==", - "requires": { - "string-width": "^1.0.2 || 2 || 3 || 4" - } - }, - "wrappy": { - 
"version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" - } - } -} diff --git a/js-usfm-parser/package.json b/js-usfm-parser/package.json deleted file mode 100644 index de465f24..00000000 --- a/js-usfm-parser/package.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "name": "usfm-grammar", - "version": "3.0.0-alpha.1", - "description": "Parser using tree-sitter-usfm3, to covert USFM to other file formats", - "main": "index.js", - "scripts": { - "test": "npm test" - }, - "repository": { - "type": "git", - "url": "https://github.com/Bridgeconn/usfm-grammar/js-usfm-parser" - }, - "keywords": [ - "USFM", - "tree-sitter", - "USX", - "Parser" - ], - "author": "BCS Team", - "license": "MIT", - "dependencies": { - "tree-sitter": "^0.20.0", - "tree-sitter-usfm3": "^3.0.0-alpha.2" - } -} diff --git a/js-usfm-parser/usfm_parser.js b/js-usfm-parser/usfm_parser.js deleted file mode 100644 index 1590ecd1..00000000 --- a/js-usfm-parser/usfm_parser.js +++ /dev/null @@ -1,56 +0,0 @@ -const Parser = require('tree-sitter'); -const USFM3 = require('tree-sitter-usfm3'); -const fs = require('fs'); - -const parser = new Parser(); -parser.setLanguage(USFM3); - - -class USFMParser{ - /* Parser class with usfmstring, syntax_tree and methods for convertions to different formats */ - constructor(usfmString){ - this.usfm = usfmString - this.syntaxTree = null - this.errors = null - let tree = null - - try{ - tree = parser.parse(this.usfm) - } catch(err){ - console.log(err.toString()) - } - this.syntaxTree = tree.rootNode - } - - toSyntaxTree(){ - return this.syntaxTree.toString() - } - - toJSON(){ - /* Coverts syntax tree to JSON, based on filter option given */ - return 'to be implemented' - } -} - -/* ------------------------------------------------- -For Testing during development -Either chanage the string value of sourceCode -or give an inputPath to usfm 
file --------------------------------------------------*/ - -let sourceCode = '\\id GEN\n\\c 1\n\\p\n\\v 1 In the begining..'; -let parserObj = new USFMParser(sourceCode) -console.log(parserObj.toSyntaxTree()) -console.log('--------------------------------------------------------') -console.log(parserObj.toJSON()) -console.log('********************************************************\n\n') -let inputPath = "../tests/basic/minimal/origin.usfm" -fs.readFile(inputPath, 'utf8', function (err, data) { - if (err) throw err; - parserObj = new USFMParser(data.toString()); - console.log(parserObj.toSyntaxTree()) - console.log('--------------------------------------------------------') - console.log(parserObj.toJSON()) - console.log('********************************************************') -}); - diff --git a/js-usfm-parser/LICENSE b/node-usfm-parser/LICENSE similarity index 100% rename from js-usfm-parser/LICENSE rename to node-usfm-parser/LICENSE diff --git a/node-usfm-parser/README.md b/node-usfm-parser/README.md new file mode 100644 index 00000000..4af31b74 --- /dev/null +++ b/node-usfm-parser/README.md @@ -0,0 +1,57 @@ +# USFM Grammar + +## Description +USFM Grammar is a JavaScript library for parsing and converting USFM (Unified Standard Format Markers) to/from USJ (Unified Standard JSON) format. This library provides functionalities to parse USFM strings into a syntax tree and convert them into a JSON-like structure (USJ), and vice versa. 
+ +## Installation +You can install USFM Grammar via npm: + +```bash +npm install usfm-grammar +``` + +## Usage +Here's how you can use USFM Grammar in your JavaScript/TypeScript projects: + +```javascript +const USFM = '\\id GEN\n\\c 1\n\\p\n\\v 1 In the beginning..\\v 2 some more text' + +const usfmParser = new USFMParser(USFM); +const USJ = usfmParser.toUSJ() +console.log(USJ); + +const usfmParser2 = new USFMParser(null, USJ) +const usfmGen = usfmParser2.usfm; +console.log(usfmGen); +``` + +When using in an ES module, if `import {USFMParser} from 'usfm-grammar'` doesn't work for you, you could try: +```javascript +import pkg from 'usfm-grammar'; +const {USFMParser} = pkg; + +... +``` + +## API Documentation + + +### `USFMParser.toUSJ(): Object` +Converts the USFM held by the parser object to a USJ object. + +- `excludeMarkers`, `includeMarkers`, `ignoreErrors`, `combineTexts`: Optional positional arguments, in that order; they default to `null`, `null`, `false` and `true`. + +Returns: A JSON-like object representing the USJ. + +### `USFMParser.usjToUsfm(usjObject: Object): string` +Converts a USJ object to a USFM string. + +- `usjObject`: The input USJ object. + +Returns: The converted USFM string. + +## Contributing +Contributions are welcome! If you find any issues or have suggestions for improvements, feel free to open an issue or create a pull request on [GitHub](https://github.com/Bridgeconn/usfm-grammar). + +## License +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
diff --git a/node-usfm-parser/package.json b/node-usfm-parser/package.json new file mode 100644 index 00000000..1711652a --- /dev/null +++ b/node-usfm-parser/package.json @@ -0,0 +1,40 @@ +{ + "name": "usfm-grammar", + "version": "3.0.0-alpha.9", + "description": "Parser using tree-sitter-usfm3, to convert usfm to usj format.", + "main": "./dist/cjs/index.cjs", + "module": "./dist/es/index.mjs", + "scripts": { + "build": "parcel build ./src/index.js", + "test": "mocha --timeout 40000 --parallel" + }, + "repository": { + "type": "git", + "url": "https://github.com/Bridgeconn/usfm-grammar/js-usfm-parser" + }, + "keywords": [ + "USFM", + "tree-sitter", + "USJ", + "Parser" + ], + "license": "MIT", + "author": "BCS Team", + "contributors": [ + "Kavitha Raju (https://github.com/kavitharaju)", + "Joel Mathew (https://github.com/joelthe1)", + "Samuel JD (https://github.com/samueljd)" + ], + "dependencies": { + "tree-sitter": "0.21.1", + "tree-sitter-usfm3": "file:../tree-sitter-usfm3", + "xmldom": "^0.6.0", + "xpath": "^0.0.34" + }, + "devDependencies": { + "ajv": "^8.17.1", + "glob": "^11.0.0", + "mocha": "^10.7.3", + "parcel": "^2.12.0" + } +} diff --git a/node-usfm-parser/src/filters.js b/node-usfm-parser/src/filters.js new file mode 100644 index 00000000..0ab55b17 --- /dev/null +++ b/node-usfm-parser/src/filters.js @@ -0,0 +1,202 @@ +class Filter { + // Defines the values of filter options + static BOOK_HEADERS = [ + "ide", "usfm", "h", "toc", "toca", // identification + "imt", "is", "ip", "ipi", "im", "imi", "ipq", "imq", "ipr", "iq", "ib", + "ili", "iot", "io", "iex", "imte", "ie" // intro + ]; + + static TITLES = [ + "mt", "mte", "cl", "cd", "ms", "mr", "s", "sr", "r", "d", "sp", "sd" // headings + ]; + + static COMMENTS = ["sts", "rem", "lit", "restore"]; // comment markers + + static PARAGRAPHS = [ + "p", "m", "po", "pr", "cls", "pmo", "pm", "pmc", // paragraphs-quotes-lists-tables + "pmr", "pi", "mi", "nb", "pc", "ph", "q", "qr", "qc", "qa", "qm", "qd", + "lh", 
"li", "lf", "lim", "litl", 'tr', "tc", "th", "tcr", "thr", 'table', "b" + ]; + + static CHARACTERS = [ + "add", "bk", "dc", "ior", "iqt", "k", "litl", "nd", "ord", "pn", + "png", "qac", "qs", "qt", "rq", "sig", "sls", "tl", "wj", // Special-text + "em", "bd", "bdit", "it", "no", "sc", "sup", // character styling + "rb", "pro", "w", "wh", "wa", "wg", // special-features + "lik", "liv", // structured list entries + "jmp" + ]; + + static NOTES = [ + "f", "fe", "ef", "efe", "x", "ex", // footnotes-and-crossrefs + "fr", "ft", "fk", "fq", "fqa", "fl", "fw", "fp", "fv", "fdc", + "xo", "xop", "xt", "xta", "xk", "xq", "xot", "xnt", "xdc" + ]; + + static STUDY_BIBLE = ['esb', 'cat']; // sidebars-extended-contents + + static BCV = ['id', 'c', 'v']; + + static TEXT = ['text-in-excluded-parent']; + + static keepOnly(inputUsj, includeMarkers, combineTexts=true) { + // let flattenedList = [].concat(...includeMarkers); + let cleanedMarkers = includeMarkers.map(marker => marker.replace(trailingNumPattern, '')); + let filteredUSJ = includeMarkersInUsj(inputUsj, cleanedMarkers, combineTexts); + + return filteredUSJ; + } + + static remove(inputUsj, excludeMarkers, combineTexts=true) { + // let flattenedList = [].concat(...excludeMarkers); + let cleanedMarkers = excludeMarkers.map(marker => marker.replace(trailingNumPattern, '')); + let filteredUSJ = excludeMarkersInUsj(inputUsj, cleanedMarkers, combineTexts); + + return filteredUSJ; + } + +} + +const MARKERS_WITH_DISCARDABLE_CONTENTS = [ + "ide", "usfm", "h", "toc", "toca", "imt", "is", "ip", "ipi", "im", "imi", + "ipq", "imq", "ipr", "iq", "ib", "ili", "iot", "io", "iex", "imte", "ie", + "mt", "mte", "cl", "cd", "ms", "mr", "s", "sr", "r", "d", "sp", "sd", + "sts", "rem", "lit", "restore", "f", "fe", "ef", "efe", "x", "ex", + "fr", "ft", "fk", "fq", "fqa", "fl", "fw", "fp", "fv", "fdc", + "xo", "xop", "xt", "xta", "xk", "xq", "xot", "xnt", "xdc", + "jmp", "fig", "cat", "esb", "b" +]; + +const trailingNumPattern = /\d+$/; +const 
punctPatternNoSpaceBefore = /^[,.\-—/;:!?@$%^)}\]>”»]/; +const punctPatternNoSpaceAfter = /[\-—/`@^&({[<“«]$/; + +function combineConsecutiveTextContents(contentsList) { + let textCombinedContents = []; + let textContents = ''; + contentsList.forEach(item => { + if (typeof item === 'string') { + if (!(textContents.endsWith(" ") || item.startsWith(" ") || textContents === '' || + punctPatternNoSpaceBefore.test(item) || punctPatternNoSpaceAfter.test(textContents))) { + textContents += " "; + } + textContents += item; + } else { + if (textContents !== "") { + textCombinedContents.push(textContents); + textContents = ""; + } + textCombinedContents.push(item); + } + }); + if (textContents !== "") { + textCombinedContents.push(textContents); + } + return textCombinedContents; +} + +function excludeMarkersInUsj(inputUsj, excludeMarkers, combineTexts = true, excludedParent = false) { + let cleanedKids = []; + if (typeof inputUsj === 'string') { + if (excludedParent || excludeMarkers.includes('text-in-excluded-parent')) { + return []; + } + return [inputUsj]; + } + + let thisMarker = ''; + if ('marker' in inputUsj) { + thisMarker = inputUsj.marker.replace(trailingNumPattern, ''); + } else if (inputUsj.type === 'ref') { + thisMarker = "ref"; + } + let thisMarkerNeeded = true; + let innerContentNeeded = true; + excludedParent = false; + + if (excludeMarkers.includes(thisMarker)) { + thisMarkerNeeded = false; + excludedParent = true; + if (MARKERS_WITH_DISCARDABLE_CONTENTS.includes(thisMarker)) { + innerContentNeeded = false; + } + } + + if ((thisMarkerNeeded || innerContentNeeded) && "content" in inputUsj) { + inputUsj.content.forEach(item => { + let cleaned = excludeMarkersInUsj(item, excludeMarkers, combineTexts, excludedParent); + if (Array.isArray(cleaned)) { + cleanedKids.push(...cleaned); + } else { + cleanedKids.push(cleaned); + } + }); + if (combineTexts) { + cleanedKids = combineConsecutiveTextContents(cleanedKids); + } + } + + if (thisMarkerNeeded) { + 
inputUsj.content = cleanedKids; + return inputUsj; + } + return cleanedKids; +} + +function includeMarkersInUsj(inputUsj, includeMarkers, combineTexts = true, excludedParent = false) { + let cleanedKids = []; + + if (typeof inputUsj === 'string') { + if (includeMarkers.includes(Filter.TEXT[0])) { + return [inputUsj] + } + return [] + } + let thisMarker = ''; + if ('marker' in inputUsj) { + thisMarker = inputUsj.marker.replace(trailingNumPattern, ''); + } else if (inputUsj.type === 'ref') { + thisMarker = "ref"; + } + let thisMarkerNeeded = includeMarkers.includes(thisMarker) || thisMarker === ''; + let innerContentNeeded = thisMarkerNeeded || !MARKERS_WITH_DISCARDABLE_CONTENTS.includes(thisMarker); + + if (innerContentNeeded && "content" in inputUsj) { + inputUsj.content.forEach(item => { + let cleaned = includeMarkersInUsj(item, includeMarkers, combineTexts, !thisMarkerNeeded); + if (Array.isArray(cleaned)) { + cleanedKids.push(...cleaned); + } else { + cleanedKids.push(cleaned); + } + }); + if (combineTexts) { + cleanedKids = combineConsecutiveTextContents(cleanedKids); + } + } + + if (thisMarker === 'c') { + if (!includeMarkers.includes('ca')) + delete inputUsj.altnumber; + if (!includeMarkers.includes('cp')) + delete inputUsj.pubnumber; + } else if (thisMarker === 'v') { + if (!includeMarkers.includes('va')) + delete inputUsj.altnumber; + if (!includeMarkers.includes('vp')) + delete inputUsj.pubnumber; + } + + + + if (thisMarkerNeeded) { + inputUsj.content = cleanedKids; + return inputUsj; + } + return cleanedKids; +} + + +exports.excludeMarkersInUsj = excludeMarkersInUsj; +exports.includeMarkersInUsj = includeMarkersInUsj; +exports.Filter = Filter; diff --git a/node-usfm-parser/src/index.js b/node-usfm-parser/src/index.js new file mode 100644 index 00000000..36f63372 --- /dev/null +++ b/node-usfm-parser/src/index.js @@ -0,0 +1,5 @@ +const {USFMParser, Filter, Format } = require("./usfmParser"); + +exports.USFMParser = USFMParser; +exports.Filter = Filter; 
+exports.Format = Format; \ No newline at end of file diff --git a/node-usfm-parser/src/listGenerator.js b/node-usfm-parser/src/listGenerator.js new file mode 100644 index 00000000..e5d238f7 --- /dev/null +++ b/node-usfm-parser/src/listGenerator.js @@ -0,0 +1,56 @@ +class ListGenerator { + /* Combines the methods used for List generation from USJ */ + constructor() { + /* Variables shared by functions */ + this.book = ""; + this.currentChapter = ""; + this.currentVerse = ""; + this.list = [["Book", "Chapter", "Verse", "Text", "Type", "Marker"]]; + } + + usjToListId(obj) { + /* Update book code */ + this.book = obj.code; + } + + usjToListC(obj) { + /* Update current chapter */ + this.currentChapter = obj.number; + } + + usjToListV(obj) { + /* Update current verse */ + this.currentVerse = obj.number; + } + + usjToList(obj) { + /* Traverse the USJ dict and build the table in this.list */ + if (obj.type === "book") { + this.usjToListId(obj); + } else if (obj.type === "chapter") { + this.usjToListC(obj); + } else if (obj.type === "verse") { + this.usjToListV(obj); + } + + let markerType = obj.type; + let markerName = obj.marker ? 
obj.marker : ''; + + if (markerType === "USJ") { + // This would occur if the JSON got flattened after removing paragraph markers + markerType = ""; + } + + if (obj.content) { + for (let item of obj.content) { + if (typeof item === "string") { + this.list.push([this.book, this.currentChapter, this.currentVerse, item, markerType, markerName]); + } else { + this.usjToList(item); + } + } + } + } +} + +exports.ListGenerator = ListGenerator; diff --git a/node-usfm-parser/src/usfmGenerator.js b/node-usfm-parser/src/usfmGenerator.js new file mode 100644 index 00000000..8c8ee550 --- /dev/null +++ b/node-usfm-parser/src/usfmGenerator.js @@ -0,0 +1,201 @@ +const { NO_USFM_USJ_TYPES, CLOSING_USJ_TYPES, NON_ATTRIB_USJ_KEYS, NO_NEWLINE_USJ_TYPES } = require("./utils/types"); +const { NON_ATTRIB_USX_KEYS, NO_NEWLINE_USX_TYPES } = require("./utils/types"); +const { DOMParser } = require('xmldom'); + +class USFMGenerator { + constructor() { + this.usfmString = ""; + } + + usjToUsfm(usjObj, nested = false) { + if (usjObj.type === "ref") { + usjObj.marker = "ref"; + } + if (!NO_USFM_USJ_TYPES.includes(usjObj.type)) { + this.usfmString += "\\"; + if (nested && usjObj.type === "char") { + this.usfmString += "+"; + } + this.usfmString += `${usjObj.marker} `; + } + ["code", "number", "caller"].forEach((key) => { + if (usjObj[key]) { + this.usfmString += `${usjObj[key]} `; + } + }); + if (usjObj.category) { + this.usfmString += `\\cat ${usjObj.category}\\cat*\n`; + } + if (usjObj.altnumber) { + if (usjObj.marker === "c") { + this.usfmString += `\\ca ${usjObj.altnumber} \\ca*\n` + }else if (usjObj.marker === "v") { + this.usfmString += `\\va ${usjObj.altnumber} \\va* ` + } + } + if (usjObj.pubnumber) { + if (usjObj.marker === "c") { + this.usfmString += `\\cp ${usjObj.pubnumber}\n` + }else if (usjObj.marker === "v") { + this.usfmString += `\\vp ${usjObj.pubnumber} \\vp* ` + } + } + if (Array.isArray(usjObj.content)) { + usjObj.content.forEach((item) => { + if (typeof item === "string") { 
+ this.usfmString += item; + } else { + this.usjToUsfm(item, usjObj.type === "char" && item.marker !== "fv"); + } + }); + } + + let attributes = []; + Object.keys(usjObj).forEach((key) => { + if (!NON_ATTRIB_USJ_KEYS.includes(key)) { + attributes.push(`${key}="${usjObj[key]}"`); + } + }); + + if (attributes.length > 0) { + this.usfmString += `|${attributes.join(" ")}`; + } + + if (CLOSING_USJ_TYPES.includes(usjObj.type)) { + this.usfmString += `\\`; + if (nested && usjObj.type === "char") { + this.usfmString += "+"; + } + this.usfmString += `${usjObj.marker}* `; + } + if ( + !NO_NEWLINE_USJ_TYPES.includes(usjObj.type) && + this.usfmString[this.usfmString.length - 1] !== "\n" + ) { + this.usfmString += "\n"; + } + return this.usfmString; + } + + usxToUsfm(xmlObj, nested=false) { + // Check if xmlObj is a string + // if (typeof xmlObj === 'string') { + // // this.usfmString += xmlObj; + // return; + // } + + const objType = xmlObj.tagName; + let marker = null; + let usfmAttributes = []; + + if (['verse', 'chapter'].includes(objType) && xmlObj.hasAttribute('eid')) { + return; + } + + if (!NO_NEWLINE_USX_TYPES.includes(objType)) { + this.usfmString += '\n'; + } + + if (objType === 'optbreak') { + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += '// '; + } + + if (xmlObj.hasAttribute('style')) { + marker = xmlObj.getAttribute('style'); + if (nested && objType === 'char' && !['xt', 'fv', 'ref'].includes(marker)) { + marker = `+${marker}`; + } + this.usfmString += `\\${marker} `; + } else if (objType === 'ref') { + marker = 'ref' + this.usfmString += `\\${marker} `; + } + + if (xmlObj.hasAttribute('code')) { + this.usfmString += xmlObj.getAttribute('code'); + } + + if (xmlObj.hasAttribute('number')) { + this.usfmString += `${xmlObj.getAttribute('number')} `; + } + + if (xmlObj.hasAttribute('caller')) { + this.usfmString += `${xmlObj.getAttribute('caller')} `; + } + + if 
(xmlObj.hasAttribute('altnumber')) { + if (objType === 'verse') { + this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`; + } + } + + if (xmlObj.hasAttribute('pubnumber')) { + if (objType === 'verse') { + this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`; + } + } + + if (xmlObj.hasAttribute('category')) { + this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`; + } + + const children = Array.from(xmlObj.childNodes); + for (const child of children) { + if (child.nodeType === 1) { // Check if child is an element node + if (objType === 'char') { + this.usxToUsfm(child, true); + } else { + this.usxToUsfm(child, false); + } + } + if (child.nodeType === 3 && child.nodeValue.trim()) { // Check if child is a text node with content + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += child.nodeValue.trim(); + } + } + + const attributes = Array.from(xmlObj.attributes); + for (const attrNode of attributes) { + let key = attrNode.name; + let val = attrNode.value.replace(/"/g, ''); + if (key === 'file' && objType === 'figure') { + usfmAttributes.push(`src="${val}"`); + } else if (!NON_ATTRIB_USX_KEYS.includes(key)) { + usfmAttributes.push(`${key}="${val}"`); + } + if (['sid', 'eid'].includes(key) && objType === 'ms') { + usfmAttributes.push(`${key}="${val}"`); + } + } + + if (usfmAttributes.length > 0) { + this.usfmString += '|'; + this.usfmString += usfmAttributes.join(' '); + } + + if ((xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true') + || CLOSING_USJ_TYPES.includes(objType) + || usfmAttributes.length > 0) { + if (objType === 'ms') { + this.usfmString += '\\*'; + } else { + this.usfmString += 
/**
 * Front end of the USFM tooling: holds one USFM document (given directly, or
 * generated from USJ or USX input), parses it with the tree-sitter USFM3
 * grammar at construction time, and converts the resulting syntax tree to
 * USJ, USX or list form.
 */
class USFMParser {
  /**
   * Exactly one of the three inputs must be non-null.
   *
   * @param {?string} usfmString - USFM text to parse.
   * @param {?object} fromUsj - USJ object; converted to USFM and then parsed.
   * @param {?object} fromUsx - USX DOM node; converted to USFM and then parsed.
   * @throws {Error} When no input or more than one input is given, or when
   *   `usfmString` is not a string.
   */
  constructor(usfmString = null, fromUsj = null, fromUsx = null) {
    const inputsGiven = [usfmString, fromUsj, fromUsx].filter(
      (input) => input !== null,
    ).length;

    if (inputsGiven > 1) {
      throw new Error(
        "Found more than one input!\nOnly one of USFM, USJ or USX is supported in one object.",
      );
    }
    if (inputsGiven === 0) {
      throw new Error("Missing input! Either USFM, USJ or USX is to be provided.");
    }

    if (usfmString !== null) {
      if (typeof usfmString !== "string") {
        throw new Error("Invalid input for USFM. Expected a string.");
      }
      this.usfm = usfmString;
    } else if (fromUsj !== null) {
      this.usj = fromUsj;
      this.usfm = this.convertUSJToUSFM();
    } else {
      this.usx = fromUsx;
      this.usfm = this.convertUSXToUSFM();
    }

    this.parser = null;
    this.initializeParser();

    this.syntaxTree = null;
    this.errors = [];
    this.warnings = [];
    this.parseUSFM();
  }

  /**
   * Creates the tree-sitter parser for the USFM3 grammar and stores the
   * options object that parseUSFM passes along for long inputs.
   */
  initializeParser() {
    this.parser = new Parser();
    this.parser.setLanguage(USFM3);
    // Kept on the instance only, so the shared Parser constructor is not modified.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };
  }

  /**
   * @returns {string} The string form of the root syntax node.
   */
  toSyntaxTree() {
    return this.syntaxTree.toString();
  }

  /**
   * Converts the parsed USFM to USJ and remembers the result in `this.usj`.
   *
   * @param {?string[]} excludeMarkers - Markers to drop from the output.
   * @param {?string[]} includeMarkers - Markers to keep in the output.
   * @param {boolean} ignoreErrors - Produce output even if parse errors were recorded.
   * @param {boolean} combineTexts - Forwarded to the Filter helpers.
   * @returns {object} Whatever convertUSFMToUSJ returns.
   */
  toUSJ(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    // JavaScript has no keyword arguments; the values are passed positionally.
    this.usj = this.convertUSFMToUSJ(
      excludeMarkers,
      includeMarkers,
      ignoreErrors,
      combineTexts,
    );
    return this.usj;
  }

  /**
   * Replaces the held document with the USFM generated from a USJ object.
   * The new USFM is not re-parsed here.
   *
   * @param {object} usjObject - USJ object to convert.
   * @returns {string} The generated USFM.
   * @throws {Error} When `usjObject` is not a non-null object.
   */
  usjToUsfm(usjObject) {
    if (typeof usjObject !== "object" || usjObject === null) {
      throw new Error("Invalid input for USJ. Expected an object.");
    }
    if (!this.parser) {
      this.initializeParser();
    }
    this.usj = usjObject;
    this.usfm = this.convertUSJToUSFM();
    return this.usfm;
  }

  /**
   * Parses `this.usfm`, records ERROR and MISSING nodes in `this.errors`
   * and stores the root node in `this.syntaxTree`. Parser exceptions
   * propagate to the caller unchanged.
   */
  parseUSFM() {
    // Inputs longer than 25000 characters are parsed with the explicit
    // buffer size from initializeParser.
    // NOTE(review): presumably the default buffer is too small for them; confirm.
    const tree =
      this.usfm.length > 25000
        ? this.parser.parse(this.usfm, null, this.parserOptions)
        : this.parser.parse(this.usfm);

    this.checkForErrors(tree);
    this.checkforMissing(tree.rootNode);
    this.syntaxTree = tree.rootNode;
  }

  /**
   * Replaces `this.errors` with one message per ERROR node of the tree,
   * when there are any.
   *
   * NOTE(review): the row is reported as-is here but as `row + 1` in
   * checkforMissing; confirm which numbering is intended.
   *
   * @param {object} tree - Tree returned by the parser.
   * @returns {Error|undefined} A summary Error (returned, not thrown) when
   *   errors were found; parseUSFM ignores it.
   */
  checkForErrors(tree) {
    const errorQuery = new Query(USFM3, "(ERROR) @errors");
    const errors = errorQuery.captures(tree.rootNode);

    if (errors.length > 0) {
      this.errors = errors.map(
        (error) =>
          `At ${error.node.startPosition.row}:${error.node.startPosition.column}, Error: ${this.usfm.substring(error.node.startIndex, error.node.endIndex)}`,
      );
      return new Error(`Errors found in USFM: ${this.errors.join(", ")}`);
    }
  }

  /**
   * Walks the descendants of `node` depth-first and appends a message to
   * `this.errors` for every node flagged `isMissing`.
   *
   * @param {object} node - Syntax node whose descendants are inspected.
   */
  checkforMissing(node) {
    for (const child of node.children) {
      if (child.isMissing) {
        this.errors.push(
          `At ${child.startPosition.row + 1}:${child.startPosition.column}, Error: Missing ${child.type}`,
        );
      }
      this.checkforMissing(child);
    }
  }

  /**
   * @returns {string} USFM generated from `this.usj`.
   */
  convertUSJToUSFM() {
    return new USFMGenerator().usjToUsfm(this.usj);
  }

  /**
   * Validates `this.usx` and generates USFM from it. When the given node is
   * not itself the `usx` element, `this.usx` is replaced by the single
   * `usx` element found beneath it.
   *
   * @returns {string} The generated USFM.
   * @throws {Error} When the input is not a DOM node containing exactly one
   *   `usx` element, or when generation fails (original error in `cause`).
   */
  convertUSXToUSFM() {
    try {
      assert(
        1 <= this.usx.nodeType && this.usx.nodeType <= 12,
        'Input must be an instance of xmldom Document or Element',
      );
      if (this.usx.tagName !== "usx") {
        assert(
          this.usx.getElementsByTagName('usx').length === 1,
          'Expects a node. Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml',
        );
        this.usx = this.usx.getElementsByTagName('usx')[0];
      }
    } catch (err) {
      throw new Error("USX not in expected format. " + err.message, { cause: err });
    }

    try {
      const usfmGen = new USFMGenerator();
      usfmGen.usxToUsfm(this.usx);
      return usfmGen.usfmString;
    } catch (err) {
      const message = "Unable to do the conversion from USX to USFM. ";
      throw new Error(message, { cause: err });
    }
  }

  /**
   * Builds the USJ object from the syntax tree and applies the optional
   * marker filters (keep-only first, then remove).
   *
   * @param {?string[]} excludeMarkers - Markers handed to Filter.remove.
   * @param {?string[]} includeMarkers - Markers handed to Filter.keepOnly
   *   (with 'USJ' appended).
   * @param {boolean} ignoreErrors - Skip the recorded-errors check.
   * @param {boolean} combineTexts - Forwarded to the Filter helpers.
   * @returns {object} The USJ object, or `{ error: message }` when the
   *   generator itself fails.
   * @throws {Error} When parse errors were recorded and `ignoreErrors` is false.
   */
  convertUSFMToUSJ(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    if (!ignoreErrors && this.errors.length > 0) {
      const errorString = this.errors.join("\n\t");
      throw new Error(
        `Errors present:\n\t${errorString}\nUse ignoreErrors = true to generate output despite errors.`,
      );
    }

    let outputUSJ;
    try {
      const usjGenerator = new USJGenerator(USFM3, this.usfm);
      usjGenerator.nodeToUSJ(this.syntaxTree, usjGenerator.jsonRootObj);
      outputUSJ = usjGenerator.jsonRootObj;
    } catch (err) {
      let message = "Unable to do the conversion. ";
      // `this.errors` is an array and therefore always truthy; its length
      // decides whether the USFM errors or the generator failure is reported.
      if (this.errors.length > 0) {
        const errorString = this.errors.join("\n\t");
        message += `Could be due to an error in the USFM\n\t${errorString}`;
      } else {
        message += err.message;
      }
      return { error: message };
    }

    if (includeMarkers) {
      outputUSJ = Filter.keepOnly(outputUSJ, [...includeMarkers, 'USJ'], combineTexts);
    }
    if (excludeMarkers) {
      outputUSJ = Filter.remove(outputUSJ, excludeMarkers, combineTexts);
    }

    return outputUSJ;
  }

  /**
   * Converts the document to list form by generating USJ (via toUSJ, which
   * also updates `this.usj`) and feeding it to ListGenerator.
   *
   * @param {?string[]} excludeMarkers - Forwarded to toUSJ.
   * @param {?string[]} includeMarkers - Forwarded to toUSJ.
   * @param {boolean} ignoreErrors - Skip the recorded-errors check.
   * @param {boolean} combineTexts - Forwarded to toUSJ.
   * @returns {Array} The list built by ListGenerator.
   * @throws {Error} When parse errors were recorded and `ignoreErrors` is
   *   false, or when the conversion fails (original error in `cause`).
   */
  toList(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    if (!ignoreErrors && this.errors.length > 0) {
      const errorString = this.errors.join("\n\t");
      throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`);
    }

    try {
      const usjDict = this.toUSJ(excludeMarkers, includeMarkers, ignoreErrors, combineTexts);

      const listGenerator = new ListGenerator();
      listGenerator.usjToList(usjDict);
      return listGenerator.list;
    } catch (exe) {
      let message = "Unable to do the conversion. ";
      if (this.errors.length > 0) {
        const errorString = this.errors.join("\n\t");
        message += `Could be due to an error in the USFM\n\t${errorString}`;
      }
      throw new Error(message, { cause: exe });
    }
  }

  /**
   * Converts the syntax tree to USX.
   *
   * @param {boolean} ignoreErrors - Skip the recorded-errors check.
   * @returns {object} The root XML node built by USXGenerator (not serialized).
   * @throws {Error} When parse errors were recorded and `ignoreErrors` is
   *   false, or when the conversion fails (original error in `cause`).
   */
  toUSX(ignoreErrors = false) {
    if (!ignoreErrors && this.errors.length > 0) {
      const errorString = this.errors.join("\n\t");
      throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`);
    }

    try {
      const usxGenerator = new USXGenerator(USFM3, this.usfm);
      usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode);
      return usxGenerator.xmlRootNode;
    } catch (exe) {
      let message = "Unable to do the conversion. ";
      if (this.errors.length > 0) {
        const errorString = this.errors.join("\n\t");
        message += `Could be due to an error in the USFM\n\t${errorString}`;
      }
      throw new Error(message, { cause: exe });
    }
  }
}
@desc)") + .captures(node); + let code = null; + let desc = null; + idCaptures.forEach((capture) => { + if (capture.name === "book-code") { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === "desc") { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + const bookJsonObj = { + type: "book", + marker: "id", + code: code, + content: [], + }; + if (desc && desc.trim() !== "") { + bookJsonObj.content.push(desc.trim()); + } + parentJsonObj.content.push(bookJsonObj); + } + + // Similar conversion methods for other node types + nodeToUSJC(node, parentJsonObj) { + // Build c, the chapter milestone node in usj + const chapCap = new Query(this.usfmLanguage, + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? + (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + let chapRef = null; + this.jsonRootObj.content.forEach((child) => { + if (child.type === "book") { + chapRef = `${child.code} ${chapNum}`; + return; + } + }); + + const chapJsonObj = { + type: "chapter", + marker: "c", + number: chapNum, + sid: chapRef, + }; + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + chapJsonObj.altnumber = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + } + if (cap.name === "pub-num") { + chapJsonObj.pubnumber = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + } + }); + + parentJsonObj.content.push(chapJsonObj); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.nodeToUSJ(child, parentJsonObj); + } + }); + } + + nodeToUSJChapter(node, parentJsonObj) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.nodeToUSJC(child, parentJsonObj); + } else { + this.nodeToUSJ(child, parentJsonObj); + } + }); + } + + nodeToUSJVerse(node, parentJsonObj) { + // 
Build verse node in USJ + const verseNumCap = new Query(this.usfmLanguage, + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? + )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + + const vJsonObj = { + type: "verse", + marker: "v", + number: verseNum.trim(), + }; + + verseNumCap.forEach((capture) => { + if (capture.name === "alt") { + const altNum = this.usfm.slice( + capture.node.startIndex, + capture.node.endIndex, + ); + vJsonObj.altnumber = altNum; + } else if (capture.name === "vp") { + const vpText = this.usfm.substring( + capture.node.startIndex, + capture.node.endIndex, + ); + vJsonObj.pubnumber = vpText; + } + }); + + const ref = `${this.findLastFromJson(this.jsonRootObj, "chapter").sid}:${verseNum}`; + vJsonObj.sid = ref.trim(); + + parentJsonObj.content.push(vJsonObj); + } + + nodeToUSJCaVa(node, parentJsonObj) { + // Build elements for independent ca and va away from c and v + const style = node.type; + const charJsonObj = { + type: "char", + marker: style, + }; + + const altNumMatch = new Query(this.usfmLanguage, + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + charJsonObj.altnumber = altNum; + parentJsonObj.content.push(charJsonObj); + } + + nodeToUSJPara(node, parentJsonObj) { + // Build paragraph nodes in USJ + if (node.children[0].type.endsWith("Block")) { + node.children[0].children.forEach((child) => { + this.nodeToUSJPara(child, parentJsonObj); + }); + } else if (node.type === "paragraph") { + const paraTagCap = new Query(this.usfmLanguage, + "(paragraph (_) @para-marker)") + .captures(node)[0]; + const paraMarker = paraTagCap.node.type; + let paraJsonObj = null; + if (paraMarker === "b") { + parentJsonObj.content.push( { type: "para", marker: paraMarker} ); + } else if 
(!paraMarker.endsWith("Block")) { + let paraJsonObj = { type: "para", marker: paraMarker, content: [] }; + paraTagCap.node.children.forEach((child) => { + this.nodeToUSJ(child, paraJsonObj); + }); + parentJsonObj.content.push(paraJsonObj); + } + } else if (["pi", "ph"].includes(node.type)) { + const paraMarker = this.usfm + .substring(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + let paraJsonObj = { type: "para", marker: paraMarker, content: [] }; + node.children.slice(1).forEach((child) => { + this.nodeToUSJ(child, paraJsonObj); + }); + parentJsonObj.content.push(paraJsonObj); + } + } + + nodeToUSJNotes(node, parentJsonObj) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteJsonObj = { + type: "note", + marker: style, + content: [], + }; + + noteJsonObj.caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + + for (let i = 2; i < node.children.length - 1; i++) { + this.nodeToUSJ(node.children[i], noteJsonObj); + } + + parentJsonObj.content.push(noteJsonObj); + } + + nodeToUSJChar(node, parentJsonObj) { + // Build USJ nodes for character markups, both regular and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + const charJsonObj = { + type: "char", + marker: style, + content: [], + }; + + // Assume a flag for closed markup, toggle this if your conditions and data structure require + // charJsonObj.closed = node.children[node.children.length - 
1].type.startsWith('\\'); + + for (let i = 1; i < childrenRange; i++) { + this.nodeToUSJ(node.children[i], charJsonObj); + } + + parentJsonObj.content.push(charJsonObj); + } + + nodeToUSJTable(node, parentJsonObj) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableJsonObj = { type: "table", content: [] }; + node.children.forEach((child) => { + this.nodeToUSJ(child, tableJsonObj); + }); + parentJsonObj.content.push(tableJsonObj); + } else if (node.type === "tr") { + const rowJsonObj = { type: "table:row", marker: "tr", content: [] }; + node.children.slice(1).forEach((child) => { + this.nodeToUSJ(child, rowJsonObj); + }); + parentJsonObj.content.push(rowJsonObj); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellJsonObj = { + type: "table:cell", + marker: style, + content: [], + align: style.includes("r") ? 
"end" : "start", + }; + node.children.slice(1).forEach((child) => { + this.nodeToUSJ(child, cellJsonObj); + }); + parentJsonObj.content.push(cellJsonObj); + } + } + + nodeToUSJAttrib(node, parentJsonObj) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = new Query(this.usfmLanguage, + "((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentJsonObj[attribName] = attribValue; + } + + nodeToUSJMilestone(node, parentJsonObj) { + // Create ms node in USJ + + const msNameCap = new Query(this.usfmLanguage, + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msJsonObj = { type: "ms", marker: style, content: [] }; + + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.nodeToUSJ(child, msJsonObj); + } + }); + + // Though normally milestones don't have contents, custom z-namespaces could have them + if (!msJsonObj.content.length) { + delete msJsonObj.content; // Remove empty content array if not used + } + + parentJsonObj.content.push(msJsonObj); + } + + nodeToUSJSpecial(node, parentJsonObj) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarJsonObj = { type: "sidebar", marker: "esb", content: [] }; + node.children.slice(1, -1).forEach((child) => { + 
this.nodeToUSJ(child, sidebarJsonObj); + }); + parentJsonObj.content.push(sidebarJsonObj); + } else if (node.type === "cat") { + const catCap = new Query(this.usfmLanguage, + "((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, catCap.node.endIndex) + .trim(); + parentJsonObj.category = category; + } else if (node.type === "fig") { + const figJsonObj = { type: "figure", marker: "fig", content: [] }; + node.children.slice(1, -1).forEach((child) => { + this.nodeToUSJ(child, figJsonObj); + }); + parentJsonObj.content.push(figJsonObj); + } else if (node.type === "ref") { + const refJsonObj = { type: "ref", content: [] }; + node.children.slice(1, -1).forEach((child) => { + this.nodeToUSJ(child, refJsonObj); + }); + parentJsonObj.content.push(refJsonObj); + } + } + + nodeToUSJGeneric(node, parentJsonObj) { + // Build nodes for para style markers in USJ + const tagNode = node.children[0]; + + let style = this.usfm.substring(tagNode.startIndex, tagNode.endIndex); + if (style.startsWith("\\")) { + style = style.replace("\\", "").trim(); + // } else { + // style = node.type; + } + + // console.log(node.children.length, node.children[0].type, node.children[1].type) + let childrenRangeStart = 1; + // if ( + // node.children.length > 1 && + // node.children[1].type.startsWith("numbered") + // ) { + // const numNode = node.children[1]; + // const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); + // style += num; + // childrenRangeStart = 2; + // } + const paraJsonObj = { type: "para", marker: style, content: [] }; + parentJsonObj.content.push(paraJsonObj); + + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // Only 
nest these types inside the upper para style node + this.nodeToUSJ(child, paraJsonObj); + } else { + this.nodeToUSJ(child, parentJsonObj); + } + } + } + + nodeToUSJ(node, parentJsonObj) { + // Check each node and based on the type convert to corresponding XML element + switch (node.type) { + case "id": + this.nodeToUSJId(node, parentJsonObj); + break; + case "chapter": + this.nodeToUSJChapter(node, parentJsonObj); + break; + case "cl": + case "cp": + case "cd": + case "vp": + this.nodeToUSJGeneric(node, parentJsonObj); + break; + case "ca": + case "va": + this.nodeToUSJCaVa(node, parentJsonObj); + break; + case "v": + this.nodeToUSJVerse(node, parentJsonObj); + break; + case "verseText": + node.children.forEach((child) => this.nodeToUSJ(child, parentJsonObj)); + break; + case "paragraph": + case "pi": + case "ph": + this.nodeToUSJPara(node, parentJsonObj); + break; + case "text": + let textVal = this.usfm + .substring(node.startIndex, node.endIndex) + .trim(); + textVal = textVal.replace("~", " ") + if (textVal !== "") { + parentJsonObj.content.push(textVal); + } + break; + case "table": + case "tr": + this.nodeToUSJTable(node, parentJsonObj); + break; + case "milestone": + case "zNameSpace": + this.nodeToUSJMilestone(node, parentJsonObj); + break; + case "esb": + case "cat": + case "fig": + case "ref": + this.nodeToUSJSpecial(node, parentJsonObj); + break; + case "usfm": + break + default: + if (NOTE_MARKERS.includes(node.type)) { + this.nodeToUSJNotes(node, parentJsonObj) + } + else if ( + CHAR_STYLE_MARKERS.includes(node.type) || + NESTED_CHAR_STYLE_MARKERS.includes(node.type) || + ["xt_standalone"].includes(node.type) + ) { + this.nodeToUSJChar(node, parentJsonObj); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + this.nodeToUSJTable(node, parentJsonObj) + } else if (node.type.endsWith("Attribute")) { + this.nodeToUSJAttrib(node, parentJsonObj); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes( + 
node.type.replace("\\", "").trim(), + ) + ) { + this.nodeToUSJGeneric(node, parentJsonObj); + } else if (["", "|"].includes(node.type.trim())) { + // Skip white space nodes + break; + } else if (node.children.length > 0) { + node.children.forEach((child) => + this.nodeToUSJ(child, parentJsonObj), + ); + } + // else { + // + // console.error("Encountered unknown element ", node.type); + + // } + break; + } + } +} + +exports.USJGenerator = USJGenerator; diff --git a/node-usfm-parser/src/usxGenerator.js b/node-usfm-parser/src/usxGenerator.js new file mode 100644 index 00000000..37c61c1d --- /dev/null +++ b/node-usfm-parser/src/usxGenerator.js @@ -0,0 +1,573 @@ +//Logics for syntax-tree to xml(USX) conversions +const { DOMImplementation, XMLSerializer } = require('xmldom'); +const xpath = require('xpath'); +const Parser = require("tree-sitter"); +const {Query} = Parser; + +const { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } = require("./utils/markers"); + + +class USXGenerator { + /** + * A binding for all methods used in generating USX from Syntax tree + * @param {object} treeSitterLanguageObj - The Tree-sitter language object + * @param {Buffer} usfmString - The USFM byte data + * @param {Element} [usxRootElement] - The root element of the USX (optional) + */ + constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) { + this.usfmLanguage = treeSitterLanguageObj; + this.usfm = usfmString; + + const domImpl = new DOMImplementation(); + const doc = domImpl.createDocument(null, 'usx', null); + + if (usxRootElement === null) { + this.xmlRootNode = doc.documentElement; + this.xmlRootNode.setAttribute('version', '3.1'); + } else { + this.xmlRootNode = usxRootElement; + } + } + + /** + * Builds the ID node in USX + * @param {SyntaxNode} node - The syntax node + * @param {Element} parentXmlNode - The parent XML node to append the ID to + */ + node2UsxId(node, 
parentXmlNode) { + const idCaptures = new Query(this.usfmLanguage, + "(id (bookcode) @book-code (description)? @desc)") + .captures(node); + + let code = null; + let desc = null; + + idCaptures.forEach(capture => { + if (capture.name === 'book-code') { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === 'desc') { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + + const bookXmlNode = parentXmlNode.ownerDocument.createElement('book'); + bookXmlNode.setAttribute('code', code); + bookXmlNode.setAttribute('style', 'id'); + + if (desc && desc.trim() !== '') { + const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim()); + bookXmlNode.appendChild(textNode); + } + + parentXmlNode.appendChild(bookXmlNode); + } + + node2UsxC(node, parentXmlNode) { + // Build c, the chapter milestone node in usj + const chapCap = new Query(this.usfmLanguage, + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? 
+ (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + const bookNode = xpath.select1("book", parentXmlNode); + const bookCode = bookNode.getAttribute("code"); + const chapRef = `${bookCode} ${chapNum}`; + + // Create the 'chapter' element + const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter'); + chapXmlNode.setAttribute("number", chapNum); + chapXmlNode.setAttribute("style", "c"); + chapXmlNode.setAttribute("sid", chapRef); + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + const altNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('altnumber', altNum); + } + if (cap.name === "pub-num") { + const pubNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('pubnumber', pubNum); + } + }); + + parentXmlNode.appendChild(chapXmlNode); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.node2Usx(child, parentXmlNode); + } + }); + } + + + + node2UsxChapter(node, parentXmlNode) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.node2UsxC(child, parentXmlNode); + } else { + this.node2Usx(child, parentXmlNode); + } + }); + + const prevVerses = xpath.select("//verse", this.xmlRootNode); + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + const sibblingCount = parentXmlNode.childNodes.length; + const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; + if (lastSibbling.tagName === "para") { + lastSibbling.appendChild(vEndXmlNode); + } else if (lastSibbling.tagName === "table") { + const rows = lastSibbling.getElementsByTagName('row'); + 
rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + parentXmlNode.appendChild(vEndXmlNode); + } + } + + } + + findPrevUncle(parentXmlNode) { + // Get the grandparent node + const grandParent = parentXmlNode.parentNode; + let uncleIndex = grandParent.childNodes.length - 2; // Start from the previous sibling + + while (uncleIndex >= 0) { + const uncle = grandParent.childNodes[uncleIndex]; + + // Skip 'sidebar' and 'ms' elements + if (uncle.tagName === "sidebar" || uncle.tagName === "ms") { + uncleIndex--; + } + // Skip elements with 'ca' or 'cp' in the style attribute + else if (uncle.getAttribute('style') === 'ca' || uncle.getAttribute('style') === 'cp') { + uncleIndex--; + } + // Return the found uncle element + else { + return uncle; + } + } + return null; // No suitable uncle found + } + + node2UsxVerse(node, parentXmlNode) { + // Find all previous 'verse' elements + const prevVerses = xpath.select("//verse", this.xmlRootNode); + + // Check if there are previous verses and if the last one has a 'sid' attribute + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + let vEndXmlNode; + if (parentXmlNode.textContent.trim() !== "") { + // If there is verse text in the current parent + vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vEndXmlNode); + } else { + // If no text, find the previous uncle and attach the end verse + const prevUncle = this.findPrevUncle(parentXmlNode); + if (prevUncle.tagName === "para") { + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + prevUncle.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = prevUncle.getElementsByTagName('row'); + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + throw new Error(`prev_uncle is ${String(prevUncle)}`); + } + } + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 
1].getAttribute('sid')); + } + + // Query to capture verse-related elements + const verseNumCap = new Query(this.usfmLanguage, + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? + )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + const vXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vXmlNode); + + // Loop through the captured elements and set the attributes + verseNumCap.forEach(capture => { + if (capture.name === 'alt') { + const altNum = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + vXmlNode.setAttribute('altnumber', altNum); + } else if (capture.name === 'vp') { + const vpText = this.usfm.slice(capture.node.startIndex, capture.node.endIndex).trim(); + vXmlNode.setAttribute('pubnumber', vpText); + } + }); + + // Get the last chapter's 'sid' attribute to form the verse reference + const chapterSid = xpath.select("//chapter", this.xmlRootNode).pop().getAttribute('sid'); + const ref = `${chapterSid}:${verseNum}`; + + // Set attributes on the newly created 'verse' element + vXmlNode.setAttribute('number', verseNum.trim()); + vXmlNode.setAttribute('style', 'v'); + vXmlNode.setAttribute('sid', ref.trim()); + } + + node2UsxCaVa(node, parentXmlNode) { + // Build elements for independent ca and va away from c and v + const style = node.type; + + // Create a new 'char' element under the parent XML node + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + charXmlNode.setAttribute('style', style); + + // Query to capture chapterNumber or verseNumber + const altNumMatch = new Query(this.usfmLanguage, + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + // Extract the alternate number from the captured range + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + // Set the attributes on the 
'char' element + charXmlNode.setAttribute('altnumber', altNum); + charXmlNode.setAttribute('closed', 'true'); + + // Append the 'char' element to the parent XML node + parentXmlNode.appendChild(charXmlNode); + } + + node2UsxPara(node, parentXmlNode) { + // Build paragraph nodes in USX + if (node.children[0].type.endsWith('Block')) { + for (const child of node.children[0].children) { + this.node2UsxPara(child, parentXmlNode); + } + } else if (node.type === 'paragraph') { + const paraTagCap = new Query(this.usfmLanguage, + "(paragraph (_) @para-marker)").captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (!paraMarker.endsWith("Block")) { + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of paraTagCap.node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } else if (['pi', 'ph'].includes(node.type)) { + const paraMarker = this.usfm.slice(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } + + + node2UsxNotes(node, parentXmlNode) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteXmlNode = parentXmlNode.ownerDocument.createElement('note'); + noteXmlNode.setAttribute('style', style); + const caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + noteXmlNode.setAttribute('caller', caller); + parentXmlNode.appendChild(noteXmlNode); + for (let i = 2; i < node.children.length 
- 1; i++) { + this.node2Usx(node.children[i], noteXmlNode); + } + + } + + node2UsxChar(node, parentXmlNode) { + // Build USX 'char' elements for character markups, both regular and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + charXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(charXmlNode); + + for (let i = 1; i < childrenRange; i++) { + this.node2Usx(node.children[i], charXmlNode); + } + + } + + node2UsxAttrib(node, parentXmlNode) { + // Set an attribute (name and value read from the USFM text) on the parent USX element + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + // A bare "|" carries no name: look the name up in DEFAULT_ATTRIB_MAP by the parent node's type + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = new Query(this.usfmLanguage, + "((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentXmlNode.setAttribute(attribName, attribValue); + } + + node2UsxTable(node, parentXmlNode) { + // Handle table related components and convert them to USX 'table' / 'row' / 'cell' elements + if (node.type === "table") { + const tableXmlNode = parentXmlNode.ownerDocument.createElement('table'); + parentXmlNode.appendChild(tableXmlNode); + node.children.forEach((child) => { + this.node2Usx(child, tableXmlNode); + }); + } else if (node.type === "tr") { + const rowXmlNode = 
parentXmlNode.ownerDocument.createElement('row'); + rowXmlNode.setAttribute("style", "tr"); + parentXmlNode.appendChild(rowXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, rowXmlNode); + }); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell"); + cellXmlNode.setAttribute("style", style); + cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); + parentXmlNode.appendChild(cellXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, cellXmlNode); + }); + } + } + + node2UsxMilestone(node, parentXmlNode) { + // Create an 'ms' element in the USX tree; only children whose type ends in "Attribute" are converted into it + + const msNameCap = new Query(this.usfmLanguage, + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); + msXmlNode.setAttribute("style", style); + parentXmlNode.appendChild(msXmlNode); + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.node2Usx(child, msXmlNode); + } + }); + } + + node2UsxSpecial(node, parentXmlNode) { + // Build USX nodes for esb, cat, fig and ref + // (cat creates no element: it sets a 'category' attribute on the parent) + + if (node.type === "esb") { + const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); + sidebarXmlNode.setAttribute('style', "esb"); + parentXmlNode.appendChild(sidebarXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, sidebarXmlNode); + }); + } else if (node.type === "cat") { + const catCap = new Query(this.usfmLanguage, + "((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, 
catCap.node.endIndex) + .trim(); + parentXmlNode.setAttribute("category", category); + } else if (node.type === "fig") { + const figXmlNode = parentXmlNode.ownerDocument.createElement('figure'); + figXmlNode.setAttribute("style", "fig"); + parentXmlNode.appendChild(figXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, figXmlNode); + }); + } else if (node.type === "ref") { + const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + parentXmlNode.appendChild(refXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, refXmlNode); + }); + } + } + + node2UsxGeneric(node, parentXmlNode) { + const tagNode = node.children[0]; + let style = this.usfm.slice(tagNode.startIndex, tagNode.endIndex).trim(); + + // Strip leading backslashes from the style or use node type + if (style.startsWith('\\')) { + style = style.replace('\\', ''); + // } else { + // style = node.type; + } + + if (style === "usfm") { + return + } + + let childrenRangeStart = 1; + + // Create a 'para' element and set its style attribute + const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); + paraXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(paraXmlNode); + + // Loop through the child nodes and recursively process them + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // If the child is of one of the allowed types, nest it inside the para node + this.node2Usx(child, paraXmlNode); + } else { + // Otherwise, append the child to the parent XML node + this.node2Usx(child, parentXmlNode); + } + } + + // Append the created para node to the parent XML node + } + + node2Usx(node, parentXmlNode) { + // Handling node types 
with respective functions + if (node.type === "id") { + this.node2UsxId(node, parentXmlNode); + } else if (node.type === "chapter") { + this.node2UsxChapter(node, parentXmlNode); + } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["ca", "va"].includes(node.type)) { + this.node2UsxCaVa(node, parentXmlNode); + } else if (node.type === "v") { + this.node2UsxVerse(node, parentXmlNode); + } else if (node.type === "verseText") { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } else if (["paragraph", "pi", "ph"].includes(node.type)) { + this.node2UsxPara(node, parentXmlNode); + } else if (NOTE_MARKERS.includes(node.type)) { + this.node2UsxNotes(node, parentXmlNode); + } else if ( + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone"]).includes(node.type) + ) { + this.node2UsxChar(node, parentXmlNode); + } else if (node.type.endsWith("Attribute")) { + this.node2UsxAttrib(node, parentXmlNode); + } else if (node.type === "text") { + let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + textVal = textVal.replace("~", " ") + const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); + parentXmlNode.appendChild(textNode); + } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { + this.node2UsxTable(node, parentXmlNode); + } else if (node.type === "milestone" || node.type === "zNameSpace") { + this.node2UsxMilestone(node, parentXmlNode); + } else if (["esb", "cat", "fig", "ref"].includes(node.type)) { + this.node2UsxSpecial(node, parentXmlNode); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + ) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["", "|"].includes(node.type.trim())) { + // Skip whitespace nodes + } else if (node.children.length > 0) { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + 
}); + } + // else { + // throw new Error(`Encountered unknown element: ${node}`); + // } + } +} + + +exports.USXGenerator = USXGenerator; diff --git a/node-usfm-parser/src/utils/format.js b/node-usfm-parser/src/utils/format.js new file mode 100644 index 00000000..3ed2cea0 --- /dev/null +++ b/node-usfm-parser/src/utils/format.js @@ -0,0 +1,12 @@ +// src/Format.js + +const Format = { + JSON: "usj", + CSV: "table", + ST: "syntax-tree", + USX: "usx", + MD: "markdown", + USFM: "usfm", +}; + +exports.Format = Format; diff --git a/node-usfm-parser/src/utils/markers.js b/node-usfm-parser/src/utils/markers.js new file mode 100644 index 00000000..120de56d --- /dev/null +++ b/node-usfm-parser/src/utils/markers.js @@ -0,0 +1,126 @@ +const CHAR_STYLE_MARKERS = [ + "add", + "bk", + "dc", + "ior", + "iqt", + "k", + "litl", + "nd", + "ord", + "pn", + "png", + "qac", + "qs", + "qt", + "rq", + "sig", + "sls", + "tl", + "wj", // Special - text + "em", + "bd", + "bdit", + "it", + "no", + "sc", + "sup", // character styling + "rb", + "pro", + "w", + "wh", + "wa", + "wg", //special - features + "lik", + "liv", //structred list entries + "jmp", + "fr", + "ft", + "fk", + "fq", + "fqa", + "fl", + "fw", + "fp", + "fv", + "fdc", //footnote - content + "xo", + "xop", + "xt", + "xta", + "xk", + "xq", + "xot", + "xnt", + "xdc", //crossref - content +]; + +exports.PARA_STYLE_MARKERS = [ + "ide", + "usfm", + "h", + "toc", + "toca", //identification + "imt", + "is", + "ip", + "ipi", + "im", + "imi", + "ipq", + "imq", + "ipr", + "iq", + "ib", + "ili", + "iot", + "io", + "iex", + "imte", + "ie", // intro + "mt", + "mte", + "cl", + "cd", + "ms", + "mr", + "s", + "sr", + "r", + "d", + "sp", + "sd", //titles + "q", + "qr", + "qc", + "qa", + "qm", + "qd", //poetry + "lh", + "li", + "lf", + "lim", + "litl", //lists + "sts", + "rem", + "lit", + "restore", //comments +]; +exports.NOTE_MARKERS = ["f", "fe", "ef", "efe", "x", "ex"]; +exports.CHAR_STYLE_MARKERS = CHAR_STYLE_MARKERS; 
+exports.NESTED_CHAR_STYLE_MARKERS = CHAR_STYLE_MARKERS.map( + (item) => item + "Nested", +); +exports.DEFAULT_ATTRIB_MAP = { + w: "lemma", + rb: "gloss", + xt: "href", + fig: "alt", + xt_standalone: "href", + xtNested: "href", + ref: "loc", + "milestone": "who", + "k":"key" +}; +exports.TABLE_CELL_MARKERS = ["tc", "th", "tcr", "thr"]; +exports.MISC_MARKERS = ["fig", "cat", "esb", "b", "ph", "pi"]; diff --git a/node-usfm-parser/src/utils/types.js b/node-usfm-parser/src/utils/types.js new file mode 100644 index 00000000..d6484a19 --- /dev/null +++ b/node-usfm-parser/src/utils/types.js @@ -0,0 +1,32 @@ +exports.NO_USFM_USJ_TYPES = ["USJ", "table"]; +exports.CLOSING_USJ_TYPES + = ["char", "note", "figure", "ref"]; +exports.NON_ATTRIB_USJ_KEYS = [ + "type", + "marker", + "content", + "number", + "sid", + "code", + "caller", + "align", + "version", + "altnumber", + "pubnumber", + "category", +]; + +exports.NON_ATTRIB_USX_KEYS = [ + "style", + "number", + "sid", + "code", + "caller", + "align", + "version", + "altnumber", + "pubnumber", + "category", +]; +exports.NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"]; +exports.NO_NEWLINE_USX_TYPES = ["char", "note", "verse", "cell"]; diff --git a/node-usfm-parser/test/basic.js b/node-usfm-parser/test/basic.js new file mode 100644 index 00000000..1c995862 --- /dev/null +++ b/node-usfm-parser/test/basic.js @@ -0,0 +1,101 @@ +const assert = require('assert'); +const {USFMParser} = require("../src/index"); + +const simpleUSFM = '\\id GEN\n\\c 1\n\\p\n\\v 1 In the begining..\\v 2'; +const simpleUSJ = { + type: 'USJ', + version: '0.3.0', + content: [ + { type: 'book', marker: 'id', code: 'GEN', content: [] }, + { type: 'chapter', marker: 'c', number: '1', sid: 'GEN 1' }, + { type: 'para', marker: 'p', content: [ + {type: 'verse', marker: 'v', number: 1 }, + "In the begining..", + {type: 'verse', marker: 'v', number: 2 } + ] } + ] +} +describe("Sanity Check for the testing pipeline", () => { + + it("Parse, toUSJ and 
back toUSFM", () => { + const usfmParser = new USFMParser(simpleUSFM); + const output = usfmParser.toUSJ() + assert.notStrictEqual(output, null, 'The result should not be null and no errors during conversion'); + + const usfm = usfmParser.usjToUsfm(output) + assert.notStrictEqual(usfm, null, 'The result should not be null and no errors during conversion'); + + + }); +}); + +describe("USFMParser Object initialization", () => { + + it("with USFM", () => { + const usfmParser = new USFMParser(simpleUSFM) + assert.strictEqual(usfmParser.usfm, simpleUSFM) + + }); + + it("with USJ", () => { + const usfmParser = new USFMParser(usfmString=null, fromUsj=simpleUSJ) + assert.strictEqual(usfmParser.usj, simpleUSJ) + + }); + + it("with nothing", () => { + let usfmParser = null; + try { + const usfmParser = new USFMParser() + + } catch(err) { + assert.strictEqual(err.message, "Missing input! Either USFM, USJ or USX is to be provided.") + } + assert.strictEqual(usfmParser, null); + }); + + it("with usfm and usj", () => { + let usfmParser = null; + try { + const usfmParser = new USFMParser(usfmString=simpleUSFM, fromUsj=simpleUSJ) + + } catch(err) { + assert.strictEqual(err.message, `Found more than one input! +Only one of USFM, USJ or USX is supported in one object.` ) + } + assert.strictEqual(usfmParser, null); + }); + + it("with usj in place of USFM", () => { + let usfmParser = null; + try { + const usfmParser = new USFMParser(usfmString=simpleUSJ) + + } catch(err) { + assert.strictEqual(err.message, "Invalid input for USFM. Expected a string.") + } + assert.strictEqual(usfmParser, null); + }); + + it("with usfm in place of USJ", () => { + let usfmParser = null; + try { + const usfmParser = new USFMParser(usfmString=null, fromUsj=simpleUSJ) + + } catch(err) { + assert.strictEqual(err.message, "Invalid input for USJ. 
Expected an object.") + } + assert.strictEqual(usfmParser, null); + }); + + it("with usj as default", () => { + let usfmParser = null; + try { + const usfmParser = new USFMParser(simpleUSJ) + + } catch(err) { + assert.strictEqual(err.message, "Invalid input for USFM. Expected a string.") + } + assert.strictEqual(usfmParser, null); + }); +}); diff --git a/node-usfm-parser/test/config.js b/node-usfm-parser/test/config.js new file mode 100644 index 00000000..815f3f3d --- /dev/null +++ b/node-usfm-parser/test/config.js @@ -0,0 +1,225 @@ +const {glob} = require('glob'); +const fs = require('node:fs'); +const { DOMParser } = require('xmldom') +const {USFMParser} = require("../src/index"); + +let allUsfmFiles = []; +let negativeTests = [] + +const TEST_DIR = "../tests"; + +allUsfmFiles = allUsfmFiles.concat( glob.sync(TEST_DIR+'/*/*/origin.usfm')); +allUsfmFiles = allUsfmFiles.concat( glob.sync(TEST_DIR+'/*/*/*/origin.usfm')); +// console.log(allUsfmFiles) + + + +let passFailOverrideList = { + //linkhref without - + "/paratextTests/Usfm30Usage/origin.usfm": "fail", + + // custom attribute without x- + "/paratextTests/InvalidAttributes/origin.usfm": "fail", + "/paratextTests/InvalidFigureAttributesReported/origin.usfm": "fail", + + // link attributes used without hyphen + "/paratextTests/LinkAttributesAreValid/origin.usfm": "fail", + + // significant space missing after \p , \q, \m, \b + "/paratextTests/CustomAttributesAreValid/origin.usfm": "fail", + "/paratextTests/NestingInFootnote/origin.usfm": "fail", + "/specExamples/cross-ref/origin.usfm": "fail", + "/paratextTests/MarkersMissingSpace/origin.usfm": "fail", + "/paratextTests/NestingInCrossReferences/origin.usfm": "fail", + "/special-cases/empty-para/origin.usfm": "fail", + // "/special-cases/sp/origin.usfm": "fail", + "/specExamples/extended/sidebars/origin.usfm":"fail", + + // No. 
of columns in table not validated by usfm-grammar + "/paratextTests/MissingColumnInTable/origin.usfm": "pass", + + // WordlistMarkerMissingFromGlossaryCitationForms from paratext. Something to do with \k or \w + "/paratextTests/WordlistMarkerMissingFromGlossaryCitationForms/origin.usfm": "pass", + + "/usfmjsTests/ts/origin.usfm": "pass", // Committee thinks these should fail though + "/usfmjsTests/chunk_footnote/origin.usfm": "pass", // Committee thinks these should fail though + "/usfmjsTests/ts_2/origin.usfm": "pass", // Committee thinks these should fail though + "/special-cases/newline-attributes/origin.usfm": "pass", // Committee thinks these should fail though + "/special-cases/empty-attributes5/origin.usfm": "pass", // Committee thinks these should fail though + + // no content in ide, rem, toc1, ip etc + "/paratextTests/NoErrorsPartiallyEmptyBook/origin.usfm": "fail", + "/paratextTests/NoErrorsEmptyBook/origin.usfm": "fail", + "/usfmjsTests/57-TIT.greek/origin.usfm": "fail", + "/paratextTests/EmptyMarkers/origin.usfm": "fail", + + // no \p (usually after \s) + "/usfmjsTests/missing_verses/origin.usfm": "fail", // has \s5 + "/usfmjsTests/isa_verse_span/origin.usfm": "fail", // has \s5 + "/usfmjsTests/isa_footnote/origin.usfm": "fail", // has \s5 + "/usfmjsTests/tit_extra_space_after_chapter/origin.usfm": "fail", // has \s5 + "/usfmjsTests/1ch_verse_span/origin.usfm": "fail", // has \s5 + "/usfmjsTests/usfmIntroTest/origin.usfm": "fail", + "/usfmjsTests/out_of_sequence_verses/origin.usfm": "fail", + "/usfmjsTests/acts_1_milestone/origin.usfm": "fail", + "/usfmjsTests/luk_quotes/origin.usfm": "fail", + "/biblica/BlankLinesWithFigures/origin.usfm": "fail", //\fig used without \p, only \b + + //no space after \s5 + "/usfmjsTests/usfmBodyTestD/origin.usfm": "fail", + "/usfmjsTests/usfm-body-testF/origin.usfm": "fail", + "/usfmjsTests/psa_quotes/origin.usfm": "fail", + "/usfmjsTests/pro_footnote/origin.usfm": "fail", + "/usfmjsTests/pro_quotes/origin.usfm": 
"fail", + "/samples-from-wild/doo43-1/origin.usfm": "fail", + "/usfmjsTests/gn_headers/origin.usfm": "fail", + "/usfmjsTests/isa_inline_quotes/origin.usfm": "fail", + "/usfmjsTests/job_footnote/origin.usfm": "fail", + "/usfmjsTests/mat-4-6.whitespace/origin.usfm": "fail", + "/usfmjsTests/out_of_sequence_chapters/origin.usfm": "fail", + + "/biblica/PublishingVersesWithFormatting/origin.usfm": "fail", // \c without number + + "/special-cases/figure_with_quotes_in_desc/origin.usfm": "fail", // quote within quote + "/specExamples/poetry/origin.usfm": "fail", // \b not followed by a \p or \q + + "/paratextTests/InvalidRubyMarkup/origin.usfm": "fail", // contradicts /paratextTests/MissingRequiredAttributesReported + "/special-cases/empty-book/origin.usfm": "pass", // Just says only \id is not enough. Not clear what else is mandatory + "/usfmjsTests/f10_gen12-2_empty_word/origin.usfm": "pass", // Empty \w \w* is accepted by us as of now + //########## Need to be fixed ####################### + "/paratextTests/NoErrorsShort/origin.usfm": "pass", // \c is mandatory! + // "/usfmjsTests/gn_headers/origin.usfm": "fail", # what is the valid position for mte and imt + "/usfmjsTests/acts_8-37-ugnt-footnote/origin.usfm": "fail", // no clue why it fails + + "/advanced/periph/origin.usfm": "fail", // Peripharals not implemented + "/advanced/nesting1/origin.usfm": "fail", // We dont support char within char w/o +, yet + "/samples-from-wild/doo43-4/origin.usfm": "fail", // ior surronded by a () leaves a stray ) at the end. + +}; + + +let excludeUSJs = [ + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.json`, //ref object introduced which is not in usfm + `${TEST_DIR}/special-cases/empty-attributes/origin.json`, //lemma not given correctly. Issue from USX + `${TEST_DIR}/specExamples/character/origin.json`,// lit element treated as a body paragraph enclosing a verse! 
Issue from USX + + ] + +let excludeUSXs = [ + `${TEST_DIR}/specExamples/extended/contentCatogories2/origin.xml`, + // \ef not treated as inline content of paragraph + `${TEST_DIR}/specExamples/extended/sectionIntroductions/origin.xml`, + // verse number="+"!!! + `${TEST_DIR}/specExamples/character/origin.xml`, + // lit element treated as a body paragraph enclosing a verse! + `${TEST_DIR}/usfmjsTests/esb/origin.xml`, + // last verse text given outside of paragraph. + `${TEST_DIR}/special-cases/nbsp/origin.xml`, + // ~ not being replaced by nbsp in usfm-grammar + `${TEST_DIR}/special-cases/empty-attributes/origin.xml`, + // attributes treated as text content of marker + `${TEST_DIR}/biblica/CategoriesOnNotes/origin.xml`, + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.xml`, + // ref node has type ref. Is it char or ref? + `${TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.xml`, + // \v and other contents contained inside \lit. New docs doesnt have \lit + `${TEST_DIR}/usfmjsTests/usfm-body-testF/origin.xml`, + // does the ms go inside \s5 or after it? 
+] + +const initialiseParser = function (inputUsfmPath){ + `Open and parse the given file` + try { + const data = fs.readFileSync(inputUsfmPath, 'utf8'); + let testParser = new USFMParser(data); + if (testParser === null) { + throw Error(`Paring failed for ${inputUsfmPath}: ${data}`) + } + return testParser; + } catch (err) { + throw err; + } +} + +const checkValidUsfm = function (inputUsfmPath) { + `Checks the metadata.xml to see is the USFM is a valid one` + if (inputUsfmPath.replace(TEST_DIR, '') in passFailOverrideList){ + if (passFailOverrideList[inputUsfmPath.replace(TEST_DIR, '')] === "pass"){ + return true + } else if (passFailOverrideList[inputUsfmPath.replace(TEST_DIR, '')] === "fail") { + return false + } + } + let value = null; + let metaFilePath = inputUsfmPath.replace("origin.usfm", "metadata.xml") + let metadata = fs.readFileSync(metaFilePath, 'utf8') + + const doc = new DOMParser().parseFromString(metadata, 'text/xml'); + + value = doc.getElementsByTagName("validated")[0].textContent; + + if (value === "fail"){ + return false + } + else if (value === "pass") { + return true + } else { + throw Error(`Validation read as : ${value} for ${metaFilePath}`) + + } +} + +const findAllMarkers = function (usfmStr, keepId = false, keepNumber = true) { + // Regex pattern to find all markers in the USFM string + let allMarkersInInput = [...usfmStr.matchAll(/\\\+?(([A-Za-z]+)\d*(-[se])?)/g)]; + + // Processing based on `keepNumber` flag + if (keepNumber) { + allMarkersInInput = allMarkersInInput.map(match => match[1]); + } else { + allMarkersInInput = allMarkersInInput.map(match => match[1] + match[2]); + } + + // Remove duplicates + allMarkersInInput = [...new Set(allMarkersInInput)]; + + // Remove 'id' marker if `keepId` is false + if (!keepId) { + const idIndex = allMarkersInInput.indexOf('id'); + if (idIndex !== -1) allMarkersInInput.splice(idIndex, 1); + } + + // Handle 'esbe' and 'usfm' markers + const esbeIndex = allMarkersInInput.indexOf('esbe'); + if 
(esbeIndex !== -1) { + const esbIndex = allMarkersInInput.indexOf('esb'); + if (esbIndex === -1) throw new Error("'esb' must be present if 'esbe' is found"); + allMarkersInInput.splice(esbeIndex, 1); + } + + const usfmIndex = allMarkersInInput.indexOf('usfm'); + if (usfmIndex !== -1) { + allMarkersInInput.splice(usfmIndex, 1); + } + + return allMarkersInInput; +} + +let isValidUsfm = {} + +allUsfmFiles.forEach((filepath) => { + isValidUsfm[filepath] = checkValidUsfm(filepath) +}); +// console.log(allUsfmFiles[0]) + +// const test_parser = initialiseParser("../tests/samples-from-wild/WEB1/origin.usfm") + + +module.exports = { + allUsfmFiles: allUsfmFiles, + initialiseParser: initialiseParser, + isValidUsfm: isValidUsfm, + excludeUSJs: excludeUSJs, + excludeUSXs: excludeUSXs, + findAllMarkers: findAllMarkers +}; diff --git a/node-usfm-parser/test/test_list_conversion.js b/node-usfm-parser/test/test_list_conversion.js new file mode 100644 index 00000000..e16f087a --- /dev/null +++ b/node-usfm-parser/test/test_list_conversion.js @@ -0,0 +1,71 @@ +const assert = require('assert'); +const fs = require('node:fs'); +const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSJs, findAllMarkers} = require('./config'); +const {USFMParser, Filter} = require("../src/index"); + + +describe("Check successful USFM-List conversion for positive samples", () => { + + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Convert ${value} to List`, (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const list = testParser.toList(); + assert(list instanceof Array); + assert.deepStrictEqual(list[0], + [ 'Book', 'Chapter', 'Verse', 'Text', 'Type', 'Marker' ]); + + }); + } + }); +}); + + +describe("Test Exclude Marker option in List conversion", () => { + // Test Exclude Maker option by checking markers in the List + const excludeTests = [ + ['s', 'r'] + ] 
+ excludeTests.forEach(function(exList) { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Exclude ${exList.slice(0, 5)} from ${value}`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const list = testParser.toList(excludeMarkers=exList); + assert(list instanceof Array); + + const allTypes = list.map(row => row[5]); + let types = new Set(allTypes); + let intersection = exList.filter(value => types.has(value)); + assert.deepStrictEqual(intersection, []) + }); + } + }) + }) +}); + +describe("Test include Marker option in List conversion", () => { + // Test include Maker option by checking markers in the List + const includeTests = [ + ['id', 'c', 'v']+Filter.TEXT+Filter.PARAGRAPHS + ] + includeTests.forEach(function(inList) { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`include ${inList.slice(0, 5)} of ${value} in List`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const list = testParser.toList(null, inList); + assert(list instanceof Array); + + const allTypes = list.slice(1).map(row => row[5]); + assert( allTypes.every(element => inList.includes(element)), allTypes) + + }); + } + }) + }) +}); \ No newline at end of file diff --git a/node-usfm-parser/test/test_parsing.js b/node-usfm-parser/test/test_parsing.js new file mode 100644 index 00000000..937e0716 --- /dev/null +++ b/node-usfm-parser/test/test_parsing.js @@ -0,0 +1,23 @@ +const assert = require('assert'); +const {allUsfmFiles, initialiseParser, isValidUsfm} = require('./config'); +const {USFMParser} = require("../src/index"); + + +describe("Check parsing pass or fail is correct", () => { + + allUsfmFiles.forEach(function(value) { + it(`Parse ${value} to ensure validity ${isValidUsfm[value]}`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser 
instanceof USFMParser) + assert(testParser.errors instanceof Array) + if (isValidUsfm[inputUsfmPath] === true) { + assert.strictEqual(testParser.errors.length, 0); + } else { + assert.notStrictEqual(testParser.errors.length, 0); + } + + + }); + + }); +}); diff --git a/node-usfm-parser/test/test_usj_conversion.js b/node-usfm-parser/test/test_usj_conversion.js new file mode 100644 index 00000000..2c1107a0 --- /dev/null +++ b/node-usfm-parser/test/test_usj_conversion.js @@ -0,0 +1,269 @@ +const assert = require('assert'); +const fs = require('node:fs'); +const Ajv = require('ajv'); +const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSJs, findAllMarkers} = require('./config'); +const {USFMParser, Filter} = require("../src/index"); + +describe("Check successful USFM-USJ conversion for positive samples", () => { + + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Convert ${value} to USJ`, (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usj = testParser.toUSJ(); + assert(usj instanceof Object); + assert.strictEqual(usj["type"], "USJ"); + assert.strictEqual(usj["version"], "3.1"); + assert.strictEqual(usj.content[0].type, "book"); + assert.strictEqual(usj.content[0].marker, "id"); + }); + } + }); +}); + + +describe("Compare generated USJ with testsuite sample", () => { + + allUsfmFiles.forEach(function(value) { + const usjPath = value.replace(".usfm", ".json"); + if (isValidUsfm[value] && ! 
excludeUSJs.includes(usjPath)) { + it(`Compare generated USJ to ${usjPath}`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + const generatedUSJ = testParser.toUSJ(); + const filePath = usjPath; + let fileData = null; + try { + fileData = fs.readFileSync(filePath, "utf8"); + } catch(err) { + if (err.code === "ENOENT") { + return + } + } + const testsuiteUSJ = JSON.parse(fileData); + stripDefaultAttribValue(testsuiteUSJ) + removeNewlinesInText(testsuiteUSJ) + stripTextValue(testsuiteUSJ) + removeNewlinesInText(generatedUSJ) + stripTextValue(generatedUSJ) + + assert.deepEqual(generatedUSJ, testsuiteUSJ); + }); + } + }); +}); + + +describe("Test USFM-USJ-USFM roundtripping", () => { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Roundtrip ${value} via USJ`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usj = testParser.toUSJ(); + assert(usj instanceof Object); + + const testParser2 = new USFMParser(usfmString=null, fromUsj=usj); + const generatedUSFM = testParser2.usfm; + assert.strictEqual(typeof generatedUSFM, 'string'); + assert(generatedUSFM.startsWith("\\id")); + + const inputMarkers = findAllMarkers(testParser.usfm) + const finalMarkers = findAllMarkers(generatedUSFM) + assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`) + + + + }); + } + }); + +}); + +describe("Ensure all markers are in USJ", () => { + // Tests if all markers in USFM are present in output also + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Check for markers of ${value} in USJ`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usj = testParser.toUSJ(); + assert(usj instanceof Object); + + const inputMarkers = [... 
new Set(findAllMarkers(testParser.usfm, keepId=true))] + const allUSJTypes = getTypes(usj); + + assert.deepStrictEqual(inputMarkers, allUSJTypes, `Markers in input and generated USJ differ`) + }); + } + }); + +}); + + +describe("Validate USJ against schema", () => { + // Test generated USJ against USJ schema + const ajv = new Ajv(); + const schemaStr = fs.readFileSync("../schemas/usj.js", 'utf8'); + const schema = JSON.parse(schemaStr); + const validate = ajv.compile(schema); + + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Validate USJ generated from ${value}`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usj = testParser.toUSJ(); + assert(usj instanceof Object); + + assert(validate(usj)); + + }); + } + }); + +}); + + +describe("Test Exclude Marker option", () => { + // Test Exclude Maker option by checking markers in the USJ + const excludeTests = [ + ['v', 'c'], + Filter.PARAGRAPHS, + [...Filter.TITLES, ...Filter.BOOK_HEADERS ] + ] + excludeTests.forEach(function(exList) { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Exclude ${exList.slice(0, 5)} from ${value}`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usj = testParser.toUSJ(excludeMarkers=exList); + assert(usj instanceof Object); + + const allUSJTypes = getTypes(usj) + let types = new Set(allUSJTypes); + let intersection = exList.filter(value => types.has(value)); + assert.deepStrictEqual(intersection, []) + }); + } + }) + }) +}); + +describe("Test Include Marker option", () => { + // Test Include Maker option by checking markers in the USJ + const includeTests = [ + ['v', 'c'], + Filter.PARAGRAPHS, + [...Filter.TITLES, ...Filter.BOOK_HEADERS ] + ] + includeTests.forEach(function(inList) { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Include 
${inList.slice(0, 5)} in ${value}`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usj = testParser.toUSJ(null, inList); + assert(usj instanceof Object); + + let allUSJTypes = getTypes(usj, keepNumber=false) + assert( allUSJTypes.every(element => inList.includes(element)), allUSJTypes) + }); + } + }) + }) +}); + + +function stripTextValue(usjObj) { + /* Trailing and preceding space handling can be different between tcdocs and our logic. + Strip both before comparison */ + if (usjObj.hasOwnProperty("content")) { + usjObj["content"].forEach((item, index) => { + if (typeof item === 'string') { + usjObj["content"][index] = item.trim(); // Strip spaces from strings + } else { + stripTextValue(item); // Recursively handle nested objects + } + }); + } +} + +function removeNewlinesInText(usjDict) { + /* The test samples in testsuite do not preserve new lines. But we do in usfm-grammar. + So removing them just for comparison */ + if (usjDict.hasOwnProperty("content")) { + usjDict["content"].forEach((item, index) => { + if (typeof item === 'string') { + // Replace newlines with spaces + usjDict["content"][index] = item.replace(/\n/g, " "); + // Replace multiple spaces with a single space + usjDict["content"][index] = usjDict["content"][index].replace(/\s+/g, " "); + } else { + removeNewlinesInText(item); // Recursively handle nested dictionaries + } + }); + } +} + + +function stripDefaultAttribValue(usjDict) { + /* The USX samples in test suite have space in lemma values when given as default attribute */ + if (usjDict.hasOwnProperty("content")) { + usjDict["content"].forEach(item => { + if (typeof item === 'object' && !Array.isArray(item)) { + if (item["type"] === "char" && item["marker"] === "w") { + if (item.hasOwnProperty("lemma")) { + item["lemma"] = item["lemma"].trim(); // Strip spaces from 'lemma' + } + } + stripDefaultAttribValue(item); // Recursively handle nested dictionaries + } + 
}); + } +} + + +function getTypes(element, keepNumber=true) { + // Recursive function to find all keys in the dict output + let types = []; + if (typeof element === 'string') { + return types; // Return empty array if element is a string + } else { + if ('marker' in element) { + types.push(element.marker); + } + if (element.type === 'ref') { + types.push("ref"); + } + if ('altnumber' in element) { + if (element.marker === 'c') { + types.push('ca'); + } else { + types.push('va'); + } + } + if ('pubnumber' in element) { + if (element.marker === 'c') { + types.push('cp'); + } else { + types.push('vp'); + } + } + if ('category' in element) { + types.push('cat'); + } + if ('content' in element) { + element.content.forEach(item => { + types = types.concat(getTypes(item)); // Recursively get types from content + }); + } + } + let uniqueTypes = [...new Set(types)]; + if (! keepNumber) { + uniqueTypes = uniqueTypes.map(item => item.replace(/\d+$/, '')); + } + return uniqueTypes; +} diff --git a/node-usfm-parser/test/test_usx_conversion.js b/node-usfm-parser/test/test_usx_conversion.js new file mode 100644 index 00000000..180e6b59 --- /dev/null +++ b/node-usfm-parser/test/test_usx_conversion.js @@ -0,0 +1,140 @@ +const assert = require('assert'); +const fs = require('node:fs'); +const { DOMImplementation, XMLSerializer, DOMParser } = require('xmldom'); +const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSXs, findAllMarkers} = require('./config'); +const {USFMParser, Filter} = require("../src/index"); + +describe("Check successful USFM-USX conversion for positive samples", () => { + const domImpl = new DOMImplementation(); + const sampleDoc = domImpl.createDocument(null, 'usx', null); + allUsfmFiles.forEach(function(value) { + + if (isValidUsfm[value]) { + it(`Convert ${value} to USX`, (inputUsfmPath=value) => { + //Tests if input parses without errors + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = 
testParser.toUSX(); + // assert(usx instanceof DOMImplementation.Document); + assert(usx.tagName === "usx"); + assert(usx.getAttribute("version") === "3.1"); + assert(usx.childNodes[0].tagName === "book"); + assert(usx.childNodes[0].getAttribute("style") === "id"); + }); + } + }); +}); + + + +describe("Ensure all markers are in USX", () => { + // Tests if all markers in USFM are present in output also + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Check for markers of ${value} in USX`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + + const inputMarkers = [... new Set(findAllMarkers(testParser.usfm, keepId=true))] + const allUSXNodes = getNodes(usx); + + assert.deepStrictEqual(inputMarkers, allUSXNodes, `Markers in input and generated USJ differ`) + }); + } + }); + +}); + +describe("Test USFM-USX-USFM roundtripping", () => { + allUsfmFiles.forEach(function(value) { + if (isValidUsfm[value]) { + it(`Roundtrip ${value} via USX`, (inputUsfmPath=value) => { + const testParser = initialiseParser(inputUsfmPath) + assert(testParser instanceof USFMParser) + const usx = testParser.toUSX(); + assert(usx.nodeType === 1); + + const testParser2 = new USFMParser(usfmString=null, fromUsj=null, fromUsx=usx); + const generatedUSFM = testParser2.usfm.trim(); + assert.strictEqual(typeof generatedUSFM, 'string'); + assert(generatedUSFM.startsWith("\\id")); + + const inputMarkers = findAllMarkers(testParser.usfm) + const finalMarkers = findAllMarkers(generatedUSFM) + assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`) + + }); + } + }); + +}); + + +// describe("Compare generated USX with testsuite sample", () => { + +// allUsfmFiles.forEach(function(value) { +// const usxPath = value.replace(".usfm", ".xml"); +// if (isValidUsfm[value] && ! 
excludeUSXs.includes(usxPath)) { +// it(`Compare generated USX to ${usxPath}`, (inputUsfmPath=value) => { +// const testParser = initialiseParser(inputUsfmPath) +// const generatedUSX = testParser.toUSX(); +// const filePath = usxPath; +// let fileData = null; +// try { +// fileData = fs.readFileSync(filePath, "utf8"); +// } catch(err) { +// if (err.code === "ENOENT") { +// return +// } +// } +// const testsuiteUSX = new DOMParser().parseFromString( +// fileData, 'text/xml').getElementsByTagName("usx")[0]; + +// assert.deepEqual(generatedUSX, testsuiteUSX); +// }); +// } +// }); +// }); + +function getNodes(element, keepNumber=true) { + // Recursive function to find all keys in the dict output + let types = []; + if (element.nodeType === element.TEXT_NODE) { + return types; // Return empty array if element is a string + } else { + if (element.getAttribute('style')) { + types.push(element.getAttribute('style')); + } + if (element.tagName === "ref") { + types.push("ref"); + } + if (element.getAttribute('altnumber')) { + if (element.tagName === 'chapter') { + types.push('ca'); + } else { + types.push('va'); + } + } + if (element.getAttribute('pubnumber')) { + if (element.tagName === 'chapter') { + types.push('cp'); + } else { + types.push('vp'); + } + } + if (element.getAttribute('category')) { + types.push('cat'); + } + if (element.childNodes.length > 0) { + Array.from(element.childNodes).forEach(child => { + types = types.concat(getNodes(child)); // Recursively get types from content + }); + } + } + let uniqueTypes = [...new Set(types)]; + if (! 
keepNumber) { + uniqueTypes = uniqueTypes.map(item => item.replace(/\d+$/, '')); + } + return uniqueTypes; +} diff --git a/py-usfm-parser/src/usfm_grammar/usfm_parser.py b/py-usfm-parser/src/usfm_grammar/usfm_parser.py index d4fac650..ceca210c 100755 --- a/py-usfm-parser/src/usfm_grammar/usfm_parser.py +++ b/py-usfm-parser/src/usfm_grammar/usfm_parser.py @@ -104,7 +104,7 @@ def __init__(self, usfm_string:str=None, from_usj:dict=None, from_usx:etree.Elem self.usfm_bytes = None self.syntax_tree = None - self.errors = None + self.errors = [] self.warnings = [] # Some basic sanity checks @@ -125,6 +125,15 @@ def __init__(self, usfm_string:str=None, from_usj:dict=None, from_usx:etree.Elem self.errors = [(f"At {err[0].start_point}", self.usfm_bytes[err[0].start_byte: err[0].end_byte].decode('utf-8')) for err in errors] + self.check_for_missing(self.syntax_tree) + + def check_for_missing(self, node): + '''Identify and report the MISSING nodes also as errors''' + for child in node.children: + if child.is_missing : + self.errors.append((f"At {child.start_point}", f"Missing {child.type}")) + else: + self.check_for_missing(child) def to_syntax_tree(self, ignore_errors=False): diff --git a/py-usfm-parser/tests/__init__.py b/py-usfm-parser/tests/__init__.py index c7261385..b7c5320f 100644 --- a/py-usfm-parser/tests/__init__.py +++ b/py-usfm-parser/tests/__init__.py @@ -94,7 +94,7 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True): f"{TEST_DIR}/paratextTests/MarkersMissingSpace/origin.usfm": "fail", f"{TEST_DIR}/paratextTests/NestingInCrossReferences/origin.usfm": "fail", f"{TEST_DIR}/special-cases/empty-para/origin.usfm": "fail", - f"{TEST_DIR}/special-cases/sp/origin.usfm": "fail", + # f"{TEST_DIR}/special-cases/sp/origin.usfm": "fail", f"{TEST_DIR}/specExamples/extended/sidebars/origin.usfm":"fail", # No. 
of columns in table not validated by usfm-grammar @@ -127,6 +127,18 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True): f"{TEST_DIR}/usfmjsTests/luk_quotes/origin.usfm": "fail", f"{TEST_DIR}/biblica/BlankLinesWithFigures/origin.usfm": "fail", #\fig used without \p, only \b + # no space after \s5 + f"{TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/usfm-body-testF/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/psa_quotes/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/pro_footnote/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/pro_quotes/origin.usfm": "fail", + f"{TEST_DIR}/samples-from-wild/doo43-1/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/gn_headers/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/isa_inline_quotes/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/job_footnote/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/mat-4-6.whitespace/origin.usfm": "fail", + f"{TEST_DIR}/usfmjsTests/out_of_sequence_chapters/origin.usfm": "fail", f"{TEST_DIR}/biblica/PublishingVersesWithFormatting/origin.usfm": "fail", # \c without number @@ -143,6 +155,7 @@ def find_all_markers(usfm_path, keep_id=False, keep_number=True): f"{TEST_DIR}/advanced/periph/origin.usfm": "fail", # Peripharals not implemented f"{TEST_DIR}/advanced/nesting1/origin.usfm": "fail", # We dont support char within char w/o +, yet + f"{TEST_DIR}/samples-from-wild/doo43-4/origin.usfm": "fail", # ior surronded by a () leaves a stray ) at the end. 
} negative_tests = [] diff --git a/py-usfm-parser/tests/test_json_conversion.py b/py-usfm-parser/tests/test_json_conversion.py index 634b289c..674d5931 100644 --- a/py-usfm-parser/tests/test_json_conversion.py +++ b/py-usfm-parser/tests/test_json_conversion.py @@ -165,7 +165,7 @@ def strip_default_attrib_value(usj_dict): if item['type'] == "char" and item['marker'] == "w": if "lemma" in item: item['lemma'] = item['lemma'].strip() - strip_default_attrib_value(item) + strip_default_attrib_value(item) @pytest.mark.parametrize('file_path', test_files) diff --git a/schemas/usj.js b/schemas/usj.js index 5e93e9da..fc766da5 100644 --- a/schemas/usj.js +++ b/schemas/usj.js @@ -1,6 +1,6 @@ { - "$schema": "USJ-0.0.1", - "$id": "https://usfm-committee/usj.schema.json", + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://github.com/usfm-bible/tcdocs/blob/main/grammar/usj.js", "title": "Unified Scripture JSON", "description": "The JSON varient of USFM and USX data models", "type": "object", diff --git a/tree-sitter-usfm3/package-lock.json b/tree-sitter-usfm3/package-lock.json index fb732afc..db2bb52e 100644 --- a/tree-sitter-usfm3/package-lock.json +++ b/tree-sitter-usfm3/package-lock.json @@ -7,19 +7,343 @@ "": { "name": "tree-sitter-usfm3", "version": "3.0.0-beta.10", + "hasInstallScript": true, "license": "MIT License", "dependencies": { - "nan": "^2.15.0", + "node-addon-api": "^7.1.0", + "node-gyp-build": "^4.8.0", "tree-sitter": "^0.21.1" }, "devDependencies": { + "prebuildify": "^6.0.0", "tree-sitter-cli": "^0.22.6" + }, + "peerDependencies": { + "tree-sitter": "^0.21.0" + }, + "peerDependenciesMeta": { + "tree_sitter": { + "optional": true + } + } + }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": 
"https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/bl": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", + "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", + "dev": true, + "license": "MIT", + "dependencies": { + "buffer": "^5.5.0", + "inherits": "^2.0.4", + "readable-stream": "^3.4.0" + } + }, + "node_modules/buffer": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", + "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.1.13" + } + }, + "node_modules/chownr": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", + "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==", + "dev": true, + "license": "ISC" + }, + "node_modules/end-of-stream": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", + "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/fs-constants": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", + "integrity": 
"sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==", + "dev": true, + "license": "MIT" + }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "BSD-3-Clause" + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mkdirp-classic": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", + "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==", + "dev": true, + "license": "MIT" + }, + "node_modules/node-abi": { + "version": "3.67.0", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.67.0.tgz", + "integrity": "sha512-bLn/fU/ALVBE9wj+p4Y21ZJWYFjUXLXPi/IewyLZkx3ApxKDNBWCKdReeKOtD8dWpOdDCeMyLh6ZewzcLsG2Nw==", + "dev": true, + "license": "MIT", + "dependencies": { + "semver": "^7.3.5" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/node-addon-api": { + "version": "7.1.1", + 
"resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-7.1.1.tgz", + "integrity": "sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==", + "license": "MIT" + }, + "node_modules/node-gyp-build": { + "version": "4.8.1", + "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.1.tgz", + "integrity": "sha512-OSs33Z9yWr148JZcbZd5WiAXhh/n9z8TxQcdMhIOlpN9AhWpLfvVFO73+m77bBABQMaY9XSvIa+qk0jlI7Gcaw==", + "bin": { + "node-gyp-build": "bin.js", + "node-gyp-build-optional": "optional.js", + "node-gyp-build-test": "build-test.js" } }, - "node_modules/nan": { - "version": "2.15.0", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.15.0.tgz", - "integrity": "sha512-8ZtvEnA2c5aYCZYd1cvgdnU6cqwixRoYg70xPLWUws5ORTa/lnw+u4amixRS/Ac5U5mQVgp9pnlSUnbNWFaWZQ==" + "node_modules/npm-run-path": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-3.1.0.tgz", + "integrity": "sha512-Dbl4A/VfiVGLgQv29URL9xshU8XDY1GeLy+fsaZ1AA8JDSfjvr5P5+pzRbWqRSBxk6/DW7MIh8lTM/PaGnP2kg==", + "dev": true, + "license": "MIT", + "dependencies": { + "path-key": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/prebuildify": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/prebuildify/-/prebuildify-6.0.1.tgz", + "integrity": 
"sha512-8Y2oOOateom/s8dNBsGIcnm6AxPmLH4/nanQzL5lQMU+sC0CMhzARZHizwr36pUPLdvBnOkCNQzxg4djuFSgIw==", + "dev": true, + "license": "MIT", + "dependencies": { + "minimist": "^1.2.5", + "mkdirp-classic": "^0.5.3", + "node-abi": "^3.3.0", + "npm-run-path": "^3.1.0", + "pump": "^3.0.0", + "tar-fs": "^2.1.0" + }, + "bin": { + "prebuildify": "bin.js" + } + }, + "node_modules/pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dev": true, + "license": "MIT", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": 
">=10" + } + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "dev": true, + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/tar-fs": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz", + "integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==", + "dev": true, + "license": "MIT", + "dependencies": { + "chownr": "^1.1.1", + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^2.1.4" + } + }, + "node_modules/tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "bl": "^4.0.3", + "end-of-stream": "^1.4.1", + "fs-constants": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1" + }, + "engines": { + "node": ">=6" + + } + }, + "node_modules/tree-sitter": { + "version": "0.21.1", + "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.21.1.tgz", + "integrity": "sha512-7dxoA6kYvtgWw80265MyqJlkRl4yawIjO7S5MigytjELkX43fV2WsAXzsNfO7sBpPPCF5Gp0+XzHk0DwLCq3xQ==", + "hasInstallScript": true, + "dependencies": { + "node-addon-api": "^8.0.0", + "node-gyp-build": "^4.8.0" + } }, "node_modules/node-addon-api": { "version": "8.0.0", @@ -58,13 +382,243 @@ "bin": { "tree-sitter": "cli.js" } + }, + "node_modules/tree-sitter/node_modules/node-addon-api": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.1.0.tgz", + "integrity": "sha512-yBY+qqWSv3dWKGODD6OGE6GnTX7Q2r+4+DfpqxHSHh8x0B4EKP9+wVGLS6U/AM1vxSNNmUEuIV5EGhYwPpfOwQ==", + 
"license": "MIT", + "engines": { + "node": "^18 || ^20 || >= 21" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true, + "license": "MIT" + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true, + "license": "ISC" } }, "dependencies": { - "nan": { - "version": "2.15.0", - "resolved": "https://registry.npmjs.org/nan/-/nan-2.15.0.tgz", - "integrity": "sha512-8ZtvEnA2c5aYCZYd1cvgdnU6cqwixRoYg70xPLWUws5ORTa/lnw+u4amixRS/Ac5U5mQVgp9pnlSUnbNWFaWZQ==" + "base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "dev": true + }, + "bl": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", + "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", + "dev": true, + "requires": { + "buffer": "^5.5.0", + "inherits": "^2.0.4", + "readable-stream": "^3.4.0" + } + }, + "buffer": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", + "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", + "dev": true, + "requires": { + "base64-js": "^1.3.1", + "ieee754": "^1.1.13" + } + }, + "chownr": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", + "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==", + "dev": true + }, + 
"end-of-stream": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", + "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", + "dev": true, + "requires": { + "once": "^1.4.0" + } + }, + "fs-constants": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", + "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==", + "dev": true + }, + "ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "dev": true + }, + "inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true + }, + "minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "dev": true + }, + "mkdirp-classic": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", + "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==", + "dev": true + }, + "node-abi": { + "version": "3.67.0", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.67.0.tgz", + "integrity": "sha512-bLn/fU/ALVBE9wj+p4Y21ZJWYFjUXLXPi/IewyLZkx3ApxKDNBWCKdReeKOtD8dWpOdDCeMyLh6ZewzcLsG2Nw==", + "dev": true, + "requires": { + "semver": "^7.3.5" + } + }, + "node-addon-api": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-7.1.1.tgz", + "integrity": 
"sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==" + }, + "node-gyp-build": { + "version": "4.8.1", + "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.1.tgz", + "integrity": "sha512-OSs33Z9yWr148JZcbZd5WiAXhh/n9z8TxQcdMhIOlpN9AhWpLfvVFO73+m77bBABQMaY9XSvIa+qk0jlI7Gcaw==" + }, + "npm-run-path": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-3.1.0.tgz", + "integrity": "sha512-Dbl4A/VfiVGLgQv29URL9xshU8XDY1GeLy+fsaZ1AA8JDSfjvr5P5+pzRbWqRSBxk6/DW7MIh8lTM/PaGnP2kg==", + "dev": true, + "requires": { + "path-key": "^3.0.0" + } + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "dev": true, + "requires": { + "wrappy": "1" + } + }, + "path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "dev": true + }, + "prebuildify": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/prebuildify/-/prebuildify-6.0.1.tgz", + "integrity": "sha512-8Y2oOOateom/s8dNBsGIcnm6AxPmLH4/nanQzL5lQMU+sC0CMhzARZHizwr36pUPLdvBnOkCNQzxg4djuFSgIw==", + "dev": true, + "requires": { + "minimist": "^1.2.5", + "mkdirp-classic": "^0.5.3", + "node-abi": "^3.3.0", + "npm-run-path": "^3.1.0", + "pump": "^3.0.0", + "tar-fs": "^2.1.0" + } + }, + "pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "readable-stream": { + "version": "3.6.2", + "resolved": 
"https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dev": true, + "requires": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + } + }, + "safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "dev": true + }, + "semver": { + "version": "7.6.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.3.tgz", + "integrity": "sha512-oVekP1cKtI+CTDvHWYFUcMtsK/00wmAEfyqKfNdARm8u1wNVhSgaX7A8d4UuIlUI5e84iEwOhs7ZPYRmzU9U6A==", + "dev": true + }, + "string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "dev": true, + "requires": { + "safe-buffer": "~5.2.0" + } + }, + "tar-fs": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz", + "integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==", + "dev": true, + "requires": { + "chownr": "^1.1.1", + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^2.1.4" + } + }, + "tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", + "dev": true, + "requires": { + "bl": "^4.0.3", + "end-of-stream": "^1.4.1", + "fs-constants": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1" + } + }, + "tree-sitter": { + "version": "0.21.1", + "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.21.1.tgz", + 
"integrity": "sha512-7dxoA6kYvtgWw80265MyqJlkRl4yawIjO7S5MigytjELkX43fV2WsAXzsNfO7sBpPPCF5Gp0+XzHk0DwLCq3xQ==", + "requires": { + "node-addon-api": "^8.0.0", + "node-gyp-build": "^4.8.0" + }, + "dependencies": { + "node-addon-api": { + "version": "8.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.1.0.tgz", + "integrity": "sha512-yBY+qqWSv3dWKGODD6OGE6GnTX7Q2r+4+DfpqxHSHh8x0B4EKP9+wVGLS6U/AM1vxSNNmUEuIV5EGhYwPpfOwQ==" + } + } }, "node-addon-api": { "version": "8.0.0", @@ -90,6 +644,18 @@ "resolved": "https://registry.npmjs.org/tree-sitter-cli/-/tree-sitter-cli-0.22.6.tgz", "integrity": "sha512-s7mYOJXi8sIFkt/nLJSqlYZP96VmKTc3BAwIX0rrrlRxWjWuCwixFqwzxWZBQz4R8Hx01iP7z3cT3ih58BUmZQ==", "dev": true + }, + "util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "dev": true } } } diff --git a/web-usfm-parser/LICENSE b/web-usfm-parser/LICENSE new file mode 100644 index 00000000..26313a92 --- /dev/null +++ b/web-usfm-parser/LICENSE @@ -0,0 +1,7 @@ +Copyright 2021 Bridge Connectivity Solutions + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial 
portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/web-usfm-parser/README.md b/web-usfm-parser/README.md new file mode 100644 index 00000000..65c72f32 --- /dev/null +++ b/web-usfm-parser/README.md @@ -0,0 +1,102 @@ +# USFM Grammar + +## Description +This is the web alternative to the [USFM-Grammar 3.x](https://www.npmjs.com/package/usfm-grammar) library, to be used from HTML, React, etc. USFM Grammar is a JavaScript library for parsing and converting USFM (Unified Standard Format Markers) to/from USJ (Unified Standard JSON) format. This library provides functionalities to parse USFM strings into a syntax tree and convert them into a JSON-like structure (USJ), and vice versa. 
+ +## Installation +You can install USFM Grammar via npm: + +```bash +npm install usfm-grammar-web +``` + +## Usage +Here's how you can use USFM Grammar in your React projects: + +```javascript +import React, { useState, useEffect } from 'react'; +import { USFMParser } from 'usfm-grammar-web'; + +function App() { + const [result, setResult] = useState(null); + const [result2, setResult2] = useState(null); + + useEffect(() => { + const initParser = async () => { + await USFMParser.init("https://cdn.jsdelivr.net/npm/usfm-grammar-web@3.0.0-alpha.1/tree-sitter-usfm.wasm", + "https://cdn.jsdelivr.net/npm/usfm-grammar-web@3.0.0-alpha.1/tree-sitter.wasm"); + }; + initParser(); + }, []); + + const calculateValue = async () => { + const usfmParser = new USFMParser('\\id GEN\n\\c 1\n\\p\n\\v 1 In the beginning..\\v 2 more text'); + const output = usfmParser.toUSJ(); + setResult(JSON.stringify(output)); + + const usfmParser2 = new USFMParser(null, output) // initialise from USJ + const usfm = usfmParser2.usfm; + setResult2(usfm); + }; + + return ( +
+
+ +

USJ: {result}

+

USFM: {result2}

+
+
+ ); +} + +export default App; +``` + +It can be used directly in the HTML script tag too. + +```html + +``` + + +If using from React, please refer to the instructions for it [here](../docs/react-usage.md). + +## API Documentation + +### `USFMParser.init()` +Initializes the USFMParser. This function must be called before creating instances of `USFMParser`. It can take the paths of the grammar and tree-sitter files (in wasm format), both of which are included in the package, as arguments. + +### `new USFMParser(usfmString: string).toUSJ(): Object` +Parses the USFM string given to the constructor and converts it to a USJ object. + +- `usfmString`: The input USFM string. + +Returns: A JSON-like object representing the USJ. + +### `new USFMParser(null, usjObject: Object)` +Initializes a parser object from USJ and also converts it to USFM. + +- `usjObject`: The input USJ object. + +Returns: The parser object. To obtain the USFM generated from USJ use `parserObject.usfm`. + +## Contributing +Contributions are welcome! If you find any issues or have suggestions for improvements, feel free to open an issue or create a pull request on [GitHub](https://github.com/Bridgeconn/usfm-grammar). + +## License +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
/**
 * Marker groups and entry points for filtering a USJ tree by marker.
 *
 * The static arrays are convenience lists that callers can pass (or spread
 * together) as the `includeMarkers` / `excludeMarkers` argument.
 */
class Filter {
    // Defines the values of filter options
    static BOOK_HEADERS = [
        "ide", "usfm", "h", "toc", "toca", // identification
        "imt", "is", "ip", "ipi", "im", "imi", "ipq", "imq", "ipr", "iq", "ib",
        "ili", "iot", "io", "iex", "imte", "ie" // intro
    ];

    static TITLES = [
        "mt", "mte", "cl", "cd", "ms", "mr", "s", "sr", "r", "d", "sp", "sd" // headings
    ];

    static COMMENTS = ["sts", "rem", "lit", "restore"]; // comment markers

    static PARAGRAPHS = [
        "p", "m", "po", "pr", "cls", "pmo", "pm", "pmc", // paragraphs-quotes-lists-tables
        "pmr", "pi", "mi", "nb", "pc", "ph", "q", "qr", "qc", "qa", "qm", "qd",
        "lh", "li", "lf", "lim", "litl", 'tr', "tc", "th", "tcr", "thr", 'table', "b"
    ];

    static CHARACTERS = [
        "add", "bk", "dc", "ior", "iqt", "k", "litl", "nd", "ord", "pn",
        "png", "qac", "qs", "qt", "rq", "sig", "sls", "tl", "wj", // Special-text
        "em", "bd", "bdit", "it", "no", "sc", "sup", // character styling
        "rb", "pro", "w", "wh", "wa", "wg", // special-features
        "lik", "liv", // structured list entries
        "jmp"
    ];

    static NOTES = [
        "f", "fe", "ef", "efe", "x", "ex", // footnotes-and-crossrefs
        "fr", "ft", "fk", "fq", "fqa", "fl", "fw", "fp", "fv", "fdc",
        "xo", "xop", "xt", "xta", "xk", "xq", "xot", "xnt", "xdc"
    ];

    static STUDY_BIBLE = ['esb', 'cat']; // sidebars-extended-contents

    static BCV = ['id', 'c', 'v'];

    // Pseudo-marker: selects the bare text found directly inside a marker
    // that is itself being filtered out.
    static TEXT = ['text-in-excluded-parent'];

    /**
     * Keep only the listed markers in the USJ tree.
     *
     * Trailing digits are stripped from each requested marker, so "q1"
     * selects every level of "q". The input tree is modified in place.
     *
     * @param {object} inputUsj - USJ object to filter.
     * @param {string[]} includeMarkers - Markers to keep.
     * @param {boolean} [combineTexts=true] - Merge adjacent text items.
     * @returns {object|Array} The filtered USJ (the same root object when the
     *     root is kept, otherwise the list of surviving children).
     */
    static keepOnly(inputUsj, includeMarkers, combineTexts=true) {
        const cleanedMarkers = includeMarkers.map((marker) => marker.replace(trailingNumPattern, ''));
        return includeMarkersInUsj(inputUsj, cleanedMarkers, combineTexts);
    }

    /**
     * Remove the listed markers from the USJ tree.
     *
     * Trailing digits are stripped from each requested marker. The input
     * tree is modified in place.
     *
     * @param {object} inputUsj - USJ object to filter.
     * @param {string[]} excludeMarkers - Markers to drop.
     * @param {boolean} [combineTexts=true] - Merge adjacent text items.
     * @returns {object|Array} The filtered USJ.
     */
    static remove(inputUsj, excludeMarkers, combineTexts=true) {
        const cleanedMarkers = excludeMarkers.map((marker) => marker.replace(trailingNumPattern, ''));
        return excludeMarkersInUsj(inputUsj, cleanedMarkers, combineTexts);
    }

}

// Markers whose whole content is dropped together with the marker. For any
// other filtered-out marker only the wrapper goes and its children are
// hoisted into the parent.
const MARKERS_WITH_DISCARDABLE_CONTENTS = [
    "ide", "usfm", "h", "toc", "toca", "imt", "is", "ip", "ipi", "im", "imi",
    "ipq", "imq", "ipr", "iq", "ib", "ili", "iot", "io", "iex", "imte", "ie",
    "mt", "mte", "cl", "cd", "ms", "mr", "s", "sr", "r", "d", "sp", "sd",
    "sts", "rem", "lit", "restore", "f", "fe", "ef", "efe", "x", "ex",
    "fr", "ft", "fk", "fq", "fqa", "fl", "fw", "fp", "fv", "fdc",
    "xo", "xop", "xt", "xta", "xk", "xq", "xot", "xnt", "xdc",
    "jmp", "fig", "cat", "esb", "b"
];

const trailingNumPattern = /\d+$/;
const punctPatternNoSpaceBefore = /^[,.\-—/;:!?@$%^)}\]>”»]/;
const punctPatternNoSpaceAfter = /[\-—/`@^&({[<“«]$/;

/**
 * Merge runs of adjacent strings in a USJ content list into single strings.
 *
 * A single space is inserted between two joined pieces unless one side
 * already supplies whitespace at the join, the next piece starts with
 * punctuation that takes no space before it, or the text so far ends with
 * punctuation that takes no space after it.
 *
 * @param {Array} contentsList - Mixed list of strings and USJ objects.
 * @returns {Array} New list with consecutive strings merged.
 */
function combineConsecutiveTextContents(contentsList) {
    const textCombinedContents = [];
    let textContents = '';
    contentsList.forEach((item) => {
        if (typeof item === 'string') {
            if (!(textContents.endsWith(" ") || item.startsWith(" ") || textContents === '' ||
                    punctPatternNoSpaceBefore.test(item) || punctPatternNoSpaceAfter.test(textContents))) {
                textContents += " ";
            }
            textContents += item;
        } else {
            if (textContents !== "") {
                textCombinedContents.push(textContents);
                textContents = "";
            }
            textCombinedContents.push(item);
        }
    });
    if (textContents !== "") {
        textCombinedContents.push(textContents);
    }
    return textCombinedContents;
}

/**
 * Return the marker name used for filtering: the `marker` value without its
 * trailing number, "ref" for ref objects, or '' when there is none.
 */
function filterMarkerOf(usjObj) {
    if ('marker' in usjObj) {
        return usjObj.marker.replace(trailingNumPattern, '');
    }
    if (usjObj.type === 'ref') {
        return "ref";
    }
    return '';
}

/**
 * Recursively remove `excludeMarkers` from a USJ node (modifies the node).
 *
 * @param {object|string} inputUsj - USJ node or a text item.
 * @param {string[]} excludeMarkers - Markers to remove, numbers stripped.
 * @param {boolean} [combineTexts=true] - Merge adjacent text items.
 * @param {boolean} [excludedParent=false] - True when `inputUsj` is a direct
 *     child of a marker that is being removed.
 * @returns {object|Array} The node itself when kept, otherwise the list of
 *     its surviving children (to be spliced into the parent).
 */
function excludeMarkersInUsj(inputUsj, excludeMarkers, combineTexts = true, excludedParent = false) {
    if (typeof inputUsj === 'string') {
        // Text is dropped only when BOTH hold: its parent is being removed
        // and the caller asked for such text to go as well. (Using "or"
        // here wiped all text as soon as either condition was true.)
        if (excludedParent && excludeMarkers.includes('text-in-excluded-parent')) {
            return [];
        }
        return [inputUsj];
    }

    const thisMarker = filterMarkerOf(inputUsj);
    const thisMarkerNeeded = !excludeMarkers.includes(thisMarker);
    const innerContentNeeded = thisMarkerNeeded ||
        !MARKERS_WITH_DISCARDABLE_CONTENTS.includes(thisMarker);

    let cleanedKids = [];
    if (innerContentNeeded && "content" in inputUsj) {
        inputUsj.content.forEach((item) => {
            // Children learn whether *this* node is the one being removed.
            const cleaned = excludeMarkersInUsj(item, excludeMarkers, combineTexts, !thisMarkerNeeded);
            if (Array.isArray(cleaned)) {
                cleanedKids.push(...cleaned);
            } else {
                cleanedKids.push(cleaned);
            }
        });
        if (combineTexts) {
            cleanedKids = combineConsecutiveTextContents(cleanedKids);
        }
    }

    if (thisMarkerNeeded) {
        inputUsj.content = cleanedKids;
        return inputUsj;
    }
    return cleanedKids;
}

/**
 * Recursively keep only `includeMarkers` in a USJ node (modifies the node).
 *
 * Nodes without a marker (such as the USJ root) are always kept. For "c"
 * and "v" nodes, `altnumber` / `pubnumber` are deleted unless "ca"/"va" and
 * "cp"/"vp" respectively are in `includeMarkers`.
 *
 * @param {object|string} inputUsj - USJ node or a text item.
 * @param {string[]} includeMarkers - Markers to keep, numbers stripped.
 * @param {boolean} [combineTexts=true] - Merge adjacent text items.
 * @param {boolean} [excludedParent=false] - True when `inputUsj` is a direct
 *     child of a marker that is not being kept.
 * @returns {object|Array} The node itself when kept, otherwise the list of
 *     its surviving children.
 */
function includeMarkersInUsj(inputUsj, includeMarkers, combineTexts = true, excludedParent = false) {
    if (typeof inputUsj === 'string') {
        // Text inside a kept marker always survives. Text whose parent was
        // filtered out survives only on explicit request. (Previously
        // `excludedParent` was ignored and all text was lost by default.)
        if (excludedParent && !includeMarkers.includes(Filter.TEXT[0])) {
            return [];
        }
        return [inputUsj];
    }

    const thisMarker = filterMarkerOf(inputUsj);
    const thisMarkerNeeded = includeMarkers.includes(thisMarker) || thisMarker === '';
    const innerContentNeeded = thisMarkerNeeded || !MARKERS_WITH_DISCARDABLE_CONTENTS.includes(thisMarker);

    let cleanedKids = [];
    if (innerContentNeeded && "content" in inputUsj) {
        inputUsj.content.forEach((item) => {
            const cleaned = includeMarkersInUsj(item, includeMarkers, combineTexts, !thisMarkerNeeded);
            if (Array.isArray(cleaned)) {
                cleanedKids.push(...cleaned);
            } else {
                cleanedKids.push(cleaned);
            }
        });
        if (combineTexts) {
            cleanedKids = combineConsecutiveTextContents(cleanedKids);
        }
    }

    if (thisMarker === 'c') {
        if (!includeMarkers.includes('ca')) {
            delete inputUsj.altnumber;
        }
        if (!includeMarkers.includes('cp')) {
            delete inputUsj.pubnumber;
        }
    } else if (thisMarker === 'v') {
        if (!includeMarkers.includes('va')) {
            delete inputUsj.altnumber;
        }
        if (!includeMarkers.includes('vp')) {
            delete inputUsj.pubnumber;
        }
    }

    if (thisMarkerNeeded) {
        inputUsj.content = cleanedKids;
        return inputUsj;
    }
    return cleanedKids;
}

export { Filter };
/**
 * Flattens a USJ tree into a table of text rows.
 *
 * `list` starts with a header row; every string found in a `content` array
 * adds one row of [book, chapter, verse, text, type, marker], using the most
 * recently seen book code, chapter number and verse number.
 */
class ListGenerator {
  constructor() {
    // Running reference state shared by the traversal methods.
    this.book = "";
    this.currentChapter = "";
    this.currentVerse = "";
    this.list = [["Book", "Chapter", "Verse", "Text", "Type", "Marker"]];
  }

  /** Record the book code from a book object. */
  usjToListId(obj) {
    this.book = obj.code;
  }

  /** Record the chapter number from a chapter object. */
  usjToListC(obj) {
    this.currentChapter = obj.number;
  }

  /** Record the verse number from a verse object. */
  usjToListV(obj) {
    this.currentVerse = obj.number;
  }

  /**
   * Walk a USJ object depth-first and append one row per text item to
   * `this.list`.
   *
   * @param {object} obj - USJ object (root or any nested element).
   */
  usjToList(obj) {
    switch (obj.type) {
      case "book":
        this.usjToListId(obj);
        break;
      case "chapter":
        this.usjToListC(obj);
        break;
      case "verse":
        this.usjToListV(obj);
        break;
      default:
        break;
    }

    // Text sitting directly under the root (which happens when the JSON got
    // flattened after removing paragraph markers) is reported with an
    // empty type.
    const rowType = obj.type === "USJ" ? "" : obj.type;
    const rowMarker = obj.marker || '';

    if (!obj.content) {
      return;
    }
    for (const child of obj.content) {
      if (typeof child !== "string") {
        this.usjToList(child);
        continue;
      }
      this.list.push([
        this.book,
        this.currentChapter,
        this.currentVerse,
        child,
        rowType,
        rowMarker,
      ]);
    }
  }
}
this.usjToUsfm(item, usjObj.type === "char"); + } + }); + } + + let attributes = []; + Object.keys(usjObj).forEach((key) => { + if (!NON_ATTRIB_USJ_KEYS.includes(key)) { + attributes.push(`${key}="${usjObj[key]}"`); + } + }); + + if (attributes.length > 0) { + this.usfmString += `|${attributes.join(" ")}`; + } + + if (CLOSING_USJ_TYPES.includes(usjObj.type)) { + this.usfmString += `\\`; + if (nested && usjObj.type === "char") { + this.usfmString += "+"; + } + this.usfmString += `${usjObj.marker}* `; + } + if ( + !NO_NEWLINE_USJ_TYPES.includes(usjObj.type) && + this.usfmString[this.usfmString.length - 1] !== "\n" + ) { + this.usfmString += "\n"; + } + return this.usfmString; + } + + usxToUsfm(xmlObj, nested=false) { + // Check if xmlObj is a string + // if (typeof xmlObj === 'string') { + // // this.usfmString += xmlObj; + // return; + // } + + const objType = xmlObj.tagName; + let marker = null; + let usfmAttributes = []; + + if (['verse', 'chapter'].includes(objType) && xmlObj.hasAttribute('eid')) { + return; + } + + if (!NO_NEWLINE_USX_TYPES.includes(objType)) { + this.usfmString += '\n'; + } + + if (objType === 'optbreak') { + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += '// '; + } + + if (xmlObj.hasAttribute('style')) { + marker = xmlObj.getAttribute('style'); + if (nested && objType === 'char' && !['xt', 'fv', 'ref'].includes(marker)) { + marker = `+${marker}`; + } + this.usfmString += `\\${marker} `; + } else if (objType === 'ref') { + marker = 'ref' + this.usfmString += `\\${marker} `; + } + + if (xmlObj.hasAttribute('code')) { + this.usfmString += xmlObj.getAttribute('code'); + } + + if (xmlObj.hasAttribute('number')) { + this.usfmString += `${xmlObj.getAttribute('number')} `; + } + + if (xmlObj.hasAttribute('caller')) { + this.usfmString += `${xmlObj.getAttribute('caller')} `; + } + + if (xmlObj.hasAttribute('altnumber')) { + if (objType === 'verse') { + 
this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`; + } + } + + if (xmlObj.hasAttribute('pubnumber')) { + if (objType === 'verse') { + this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`; + } else if (objType === 'chapter') { + this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`; + } + } + + if (xmlObj.hasAttribute('category')) { + this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`; + } + + const children = Array.from(xmlObj.childNodes); + for (const child of children) { + if (child.nodeType === 1) { // Check if child is an element node + if (objType === 'char') { + this.usxToUsfm(child, true); + } else { + this.usxToUsfm(child, false); + } + } + if (child.nodeType === 3 && child.nodeValue.trim()) { // Check if child is a text node with content + if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) { + this.usfmString += ' '; + } + this.usfmString += child.nodeValue.trim(); + } + } + + const attributes = Array.from(xmlObj.attributes); + for (const attrNode of attributes) { + let key = attrNode.name; + let val = attrNode.value.replace(/"/g, ''); + if (key === 'file' && objType === 'figure') { + usfmAttributes.push(`src="${val}"`); + } else if (!NON_ATTRIB_USX_KEYS.includes(key)) { + usfmAttributes.push(`${key}="${val}"`); + } + if (['sid', 'eid'].includes(key) && objType === 'ms') { + usfmAttributes.push(`${key}="${val}"`); + } + } + + if (usfmAttributes.length > 0) { + this.usfmString += '|'; + this.usfmString += usfmAttributes.join(' '); + } + + if ((xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true') + || CLOSING_USJ_TYPES.includes(objType) + || usfmAttributes.length > 0) { + if (objType === 'ms') { + this.usfmString += '\\*'; + } else { + this.usfmString += `\\${marker}*`; + } + } + + if (objType === 'sidebar') { + this.usfmString 
/**
 * Parses USFM with the tree-sitter USFM grammar and converts between USFM,
 * USJ, USX and a flat list form.
 *
 * `USFMParser.init()` must have completed before any instance is created.
 * An instance is built from exactly one of a USFM string, a USJ object or a
 * USX node; USJ/USX inputs are first converted to USFM, which is then parsed.
 * Parse problems are collected in `this.errors` rather than thrown.
 */
class USFMParser {
  /** Loaded tree-sitter Language for USFM; null until init() has run. */
  static language = null;

  /**
   * Initialise web-tree-sitter and load the USFM grammar.
   *
   * @param {string} grammarPath - Location of the grammar wasm file.
   * @param {string} parserPath - Location of the tree-sitter runtime wasm.
   */
  static async init(grammarPath="node_modules/usfm-grammar/tree-sitter-usfm.wasm",
                    parserPath="node_modules/usfm-grammar/tree-sitter.wasm") {
    await Parser.init({
      locateFile() {
        return parserPath;
      },
    });
    USFMParser.language = await Parser.Language.load(grammarPath);
  }

  /**
   * @param {?string} usfmString - USFM text.
   * @param {?object} fromUsj - USJ object to start from instead.
   * @param {?object} fromUsx - USX DOM node to start from instead.
   * @throws {Error} When zero or more than one input is given, when the
   *     USFM input is not a string, or when init() has not been called.
   */
  constructor(usfmString=null, fromUsj=null, fromUsx=null) {
    const inputsGiven = [usfmString, fromUsj, fromUsx]
      .filter((input) => input !== null).length;

    if (inputsGiven > 1) {
      throw new Error("Found more than one input!\nOnly one of USFM, USJ or USX is supported in one object.");
    }
    if (inputsGiven === 0) {
      throw new Error("Missing input! Either USFM, USJ or USX is to be provided.");
    }

    if (usfmString !== null) {
      if (typeof usfmString !== "string") {
        throw new Error("Invalid input for USFM. Expected a string.");
      }
      this.usfm = usfmString;
    } else if (fromUsj !== null) {
      this.usj = fromUsj;
      this.usfm = this.convertUSJToUSFM();
    } else {
      this.usx = fromUsx;
      this.usfm = this.convertUSXToUSFM();
    }

    this.parser = null;
    this.initializeParser();

    this.syntaxTree = null;
    this.errors = [];
    this.warnings = [];
    this.parseUSFM();
  }

  /**
   * Create the tree-sitter parser for this instance.
   * @throws {Error} When USFMParser.init() has not been called.
   */
  initializeParser() {
    if (!USFMParser.language) {
      throw new Error(
        "USFMParser not initialized. Call USFMParser.init() before creating instances.",
      );
    }
    this.parser = new Parser();
    this.parser.setLanguage(USFMParser.language);
    // Kept per instance; the imported Parser module is no longer modified.
    // NOTE(review): confirm that web-tree-sitter's parse() honours bufferSize.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };
  }

  /** @returns {string} The syntax tree's string form. */
  toSyntaxTree() {
    return this.syntaxTree.toString();
  }

  /**
   * Convert the parsed USFM to USJ and cache it on `this.usj`.
   *
   * @param {?string[]} excludeMarkers - Markers to remove from the output.
   * @param {?string[]} includeMarkers - Markers to keep in the output.
   * @param {boolean} ignoreErrors - Produce output despite parse errors.
   * @param {boolean} combineTexts - Merge adjacent text items when filtering.
   * @returns {object} See convertUSFMToUSJ().
   */
  toUSJ(excludeMarkers = null,
        includeMarkers = null,
        ignoreErrors = false,
        combineTexts = true,) {
    this.usj = this.convertUSFMToUSJ(excludeMarkers, includeMarkers, ignoreErrors, combineTexts);
    return this.usj;
  }

  /**
   * Replace this instance's USJ with `usjObject` and regenerate `this.usfm`.
   * The syntax tree is not re-parsed here.
   *
   * @param {object} usjObject - USJ object.
   * @returns {string} Generated USFM.
   * @throws {Error} When `usjObject` is not a non-null object.
   */
  usjToUsfm(usjObject) {
    if (typeof usjObject !== "object" || usjObject === null) {
      throw new Error("Invalid input for USJ. Expected an object.");
    }
    if (!this.parser) {
      this.initializeParser();
    }
    this.usj = usjObject;
    this.usfm = this.convertUSJToUSFM();
    return this.usfm;
  }

  /**
   * Convert `this.usx` to USFM. If the node is not itself a <usx> element,
   * the single <usx> element found inside it is used (and stored back on
   * `this.usx`).
   *
   * @returns {string} Generated USFM.
   * @throws {Error} When the input is not in the expected shape or the
   *     conversion fails (the original error is attached as `cause`).
   */
  convertUSXToUSFM() {
    try {
      assert(1 <= this.usx.nodeType && this.usx.nodeType <= 12 ,
        'Input must be an instance of xmldom Document or Element'
      );
      if (this.usx.tagName !== "usx") {
        assert(this.usx.getElementsByTagName('usx').length === 1,
          'Expects a  node. Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml');

        this.usx = this.usx.getElementsByTagName('usx')[0];
      }
    } catch(err) {
      throw new Error("USX not in expected format. "+err.message);
    }
    try {
      const usfmGen = new USFMGenerator();
      usfmGen.usxToUsfm(this.usx);
      return usfmGen.usfmString;
    } catch(err) {
      const message = "Unable to do the conversion from USX to USFM. ";
      throw new Error(message, { cause: err });
    }
  }

  /**
   * Parse `this.usfm`, record ERROR and MISSING nodes in `this.errors`, and
   * store the root node in `this.syntaxTree`. Exceptions raised by the
   * tree-sitter parser propagate unchanged.
   */
  parseUSFM() {
    // Inputs above 25000 characters are parsed with the explicit options.
    const tree = this.usfm.length > 25000
      ? this.parser.parse(this.usfm, null, this.parserOptions)
      : this.parser.parse(this.usfm);
    this.checkForErrors(tree);
    this.checkforMissing(tree.rootNode);
    this.syntaxTree = tree.rootNode;
  }

  /**
   * Collect ERROR nodes of the tree into `this.errors`.
   *
   * NOTE(review): the row reported here is tree-sitter's row as-is, whereas
   * checkforMissing() reports row + 1 — confirm which one is intended.
   *
   * @returns {?Error} An Error summarising the problems, or null if none.
   */
  checkForErrors(tree) {
    const errorQuery = this.parser.getLanguage().query("(ERROR) @errors");
    const errors = errorQuery.captures(tree.rootNode);
    if (errors.length > 0) {
      this.errors = errors.map(
        (err) =>
          `At ${err.node.startPosition.row}:${err.node.startPosition.column}, Error: ${this.usfm.substring(err.node.startIndex, err.node.endIndex)}`,
      );
      return new Error(`Errors found in USFM: ${this.errors.join(", ")}`);
    }
    return null;
  }

  /** Recursively append a message to `this.errors` for every MISSING node. */
  checkforMissing(node) {
    for (const n of node.children) {
      if (n.isMissing) {
        this.errors.push(
          `At ${n.startPosition.row+1}:${n.startPosition.column}, Error: Missing ${n.type}`);
      }
      this.checkforMissing(n);
    }
  }

  /** @returns {string} USFM generated from `this.usj`. */
  convertUSJToUSFM() {
    return new USFMGenerator().usjToUsfm(this.usj);
  }

  /**
   * Build USJ from the syntax tree, optionally filtered by marker.
   *
   * @param {?string[]} excludeMarkers - Markers to remove.
   * @param {?string[]} includeMarkers - Markers to keep ('USJ' is added).
   * @param {boolean} ignoreErrors - Produce output despite parse errors.
   * @param {boolean} combineTexts - Merge adjacent text items when filtering.
   * @returns {object} The USJ object, or `{error: message}` when the
   *     generator itself fails.
   * @throws {Error} When parse errors exist and `ignoreErrors` is false.
   */
  convertUSFMToUSJ(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    if (!ignoreErrors && this.errors.length > 0) {
      const errorString = this.errors.join("\n\t");
      throw new Error(
        `Errors present:\n\t${errorString}\nUse ignoreErrors = true to generate output despite errors.`,
      );
    }

    let outputUSJ;
    try {
      const usjGenerator = new USJGenerator(
        USFMParser.language,
        this.usfm
      );
      usjGenerator.nodeToUSJ(this.syntaxTree, usjGenerator.jsonRootObj);
      outputUSJ = usjGenerator.jsonRootObj;
    } catch (err) {
      let message = "Unable to do the conversion. ";
      // An array is always truthy, so test its length; otherwise the branch
      // that reports the underlying exception can never run.
      if (this.errors.length > 0) {
        const errorString = this.errors.join("\n\t");
        message += `Could be due to an error in the USFM\n\t${errorString}`;
      } else {
        message = err.message;
      }
      return {error: message};
    }

    if (includeMarkers) {
      outputUSJ = Filter.keepOnly(outputUSJ, [...includeMarkers, 'USJ'], combineTexts);
    }
    if (excludeMarkers) {
      outputUSJ = Filter.remove(outputUSJ, excludeMarkers, combineTexts);
    }

    return outputUSJ;
  }

  /**
   * Convert to USJ and flatten it into rows via ListGenerator.
   *
   * @returns {Array<Array>} Header row followed by one row per text item.
   * @throws {Error} When parse errors exist and `ignoreErrors` is false, or
   *     when the conversion fails (original error attached as `cause`).
   */
  toList(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true
  ) {
    if (!ignoreErrors && this.errors.length > 0) {
      const errorString = this.errors.join("\n\t");
      throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`);
    }

    try {
      const usjDict = this.toUSJ(excludeMarkers, includeMarkers, ignoreErrors, combineTexts);

      const listGenerator = new ListGenerator();
      listGenerator.usjToList(usjDict);
      return listGenerator.list;
    } catch (exe) {
      let message = "Unable to do the conversion. ";
      if (this.errors.length > 0) {
        const errorString = this.errors.join("\n\t");
        message += `Could be due to an error in the USFM\n\t${errorString}`;
      }
      throw new Error(message, { cause: exe });
    }
  }

  /**
   * Convert the syntax tree to USX.
   *
   * @param {boolean} ignoreErrors - Produce output despite parse errors.
   * @returns {object} The root node built by USXGenerator (not serialized).
   * @throws {Error} When parse errors exist and `ignoreErrors` is false, or
   *     when the conversion fails (original error attached as `cause`).
   */
  toUSX(ignoreErrors = false) {
    if (!ignoreErrors && this.errors.length > 0) {
      const errorString = this.errors.join("\n\t");
      throw new Error(`Errors present:\n\t${errorString}\nUse ignoreErrors=true to generate output despite errors`);
    }

    try {
      const usxGenerator = new USXGenerator(USFMParser.language,
                                            this.usfm);
      usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode);
      return usxGenerator.xmlRootNode;
    } catch (exe) {
      let message = "Unable to do the conversion. ";
      if (this.errors.length > 0) {
        const errorString = this.errors.join("\n\t");
        message += `Could be due to an error in the USFM\n\t${errorString}`;
      }
      throw new Error(message, { cause: exe });
    }
  }

}
@desc)") + .captures(node); + let code = null; + let desc = null; + idCaptures.forEach((capture) => { + if (capture.name === "book-code") { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === "desc") { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + const bookJsonObj = { + type: "book", + marker: "id", + code: code, + content: [], + }; + if (desc && desc.trim() !== "") { + bookJsonObj.content.push(desc.trim()); + } + parentJsonObj.content.push(bookJsonObj); + } + + // Similar conversion methods for other node types + nodeToUSJC(node, parentJsonObj) { + // Build c, the chapter milestone node in usj + const chapCap = this.usfmLanguage + .query( + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? + (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + let chapRef = null; + this.jsonRootObj.content.forEach((child) => { + if (child.type === "book") { + chapRef = `${child.code} ${chapNum}`; + return; + } + }); + + const chapJsonObj = { + type: "chapter", + marker: "c", + number: chapNum, + sid: chapRef, + }; + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + chapJsonObj.altnumber = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + } + if (cap.name === "pub-num") { + chapJsonObj.pubnumber = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + } + }); + + parentJsonObj.content.push(chapJsonObj); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.nodeToUSJ(child, parentJsonObj); + } + }); + } + + nodeToUSJChapter(node, parentJsonObj) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.nodeToUSJC(child, parentJsonObj); + } else { + this.nodeToUSJ(child, parentJsonObj); + } + }); + } + + nodeToUSJVerse(node, parentJsonObj) { + // 
Build verse node in USJ + const verseNumCap = this.usfmLanguage + .query( + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? + )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + + const vJsonObj = { + type: "verse", + marker: "v", + number: verseNum.trim(), + }; + + verseNumCap.forEach((capture) => { + if (capture.name === "alt") { + const altNum = this.usfm.slice( + capture.node.startIndex, + capture.node.endIndex, + ); + vJsonObj.altnumber = altNum; + } else if (capture.name === "vp") { + const vpText = this.usfm.substring( + capture.node.startIndex, + capture.node.endIndex, + ); + vJsonObj.pubnumber = vpText; + } + }); + + const ref = `${this.findLastFromJson(this.jsonRootObj, "chapter").sid}:${verseNum}`; + vJsonObj.sid = ref.trim(); + + parentJsonObj.content.push(vJsonObj); + } + + nodeToUSJCaVa(node, parentJsonObj) { + // Build elements for independent ca and va away from c and v + const style = node.type; + const charJsonObj = { + type: "char", + marker: style, + }; + + const altNumMatch = this.usfmLanguage + .query( + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + charJsonObj.altnumber = altNum; + parentJsonObj.content.push(charJsonObj); + } + + nodeToUSJPara(node, parentJsonObj) { + // Build paragraph nodes in USJ + if (node.children[0].type.endsWith("Block")) { + node.children[0].children.forEach((child) => { + this.nodeToUSJPara(child, parentJsonObj); + }); + } else if (node.type === "paragraph") { + const paraTagCap = this.usfmLanguage + .query("(paragraph (_) @para-marker)") + .captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (paraMarker === "b") { + parentJsonObj.content.push( { type: "para", marker: paraMarker} ); + } else if (!paraMarker.endsWith("Block")) { + const 
paraJsonObj = { type: "para", marker: paraMarker, content: [] }; + paraTagCap.node.children.forEach((child) => { + this.nodeToUSJ(child, paraJsonObj); + }); + parentJsonObj.content.push(paraJsonObj); + } + } else if (["pi", "ph"].includes(node.type)) { + const paraMarker = this.usfm + .substring(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + const paraJsonObj = { type: "para", marker: paraMarker, content: [] }; + node.children.slice(1).forEach((child) => { + this.nodeToUSJ(child, paraJsonObj); + }); + parentJsonObj.content.push(paraJsonObj); + } + } + + nodeToUSJNotes(node, parentJsonObj) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteJsonObj = { + type: "note", + marker: style, + content: [], + }; + + noteJsonObj.caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + + for (let i = 2; i < node.children.length - 1; i++) { + this.nodeToUSJ(node.children[i], noteJsonObj); + } + + parentJsonObj.content.push(noteJsonObj); + } + + nodeToUSJChar(node, parentJsonObj) { + // Build USJ nodes for character markups, both regular and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + const charJsonObj = { + type: "char", + marker: style, + content: [], + }; + + // Assume a flag for closed markup, toggle this if your conditions and data structure require + // charJsonObj.closed = node.children[node.children.length - 1].type.startsWith('\\'); + + for (let i = 1; i 
< childrenRange; i++) { + this.nodeToUSJ(node.children[i], charJsonObj); + } + + parentJsonObj.content.push(charJsonObj); + } + + nodeToUSJTable(node, parentJsonObj) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableJsonObj = { type: "table", content: [] }; + node.children.forEach((child) => { + this.nodeToUSJ(child, tableJsonObj); + }); + parentJsonObj.content.push(tableJsonObj); + } else if (node.type === "tr") { + const rowJsonObj = { type: "table:row", marker: "tr", content: [] }; + node.children.slice(1).forEach((child) => { + this.nodeToUSJ(child, rowJsonObj); + }); + parentJsonObj.content.push(rowJsonObj); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellJsonObj = { + type: "table:cell", + marker: style, + content: [], + align: style.includes("r") ? "end" : "start", + }; + node.children.slice(1).forEach((child) => { + this.nodeToUSJ(child, cellJsonObj); + }); + parentJsonObj.content.push(cellJsonObj); + } + } + + nodeToUSJAttrib(node, parentJsonObj) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = this.usfmLanguage + .query("((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentJsonObj[attribName] = attribValue; + } + + nodeToUSJMilestone(node, parentJsonObj) { + // Create ms node in USJ + + 
const msNameCap = this.usfmLanguage + .query( + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msJsonObj = { type: "ms", marker: style, content: [] }; + + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.nodeToUSJ(child, msJsonObj); + } + }); + + // Though normally milestones don't have contents, custom z-namespaces could have them + if (!msJsonObj.content.length) { + delete msJsonObj.content; // Remove empty content array if not used + } + + parentJsonObj.content.push(msJsonObj); + } + + nodeToUSJSpecial(node, parentJsonObj) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarJsonObj = { type: "sidebar", marker: "esb", content: [] }; + node.children.slice(1, -1).forEach((child) => { + this.nodeToUSJ(child, sidebarJsonObj); + }); + parentJsonObj.content.push(sidebarJsonObj); + } else if (node.type === "cat") { + const catCap = this.usfmLanguage + .query("((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, catCap.node.endIndex) + .trim(); + parentJsonObj.category = category; + } else if (node.type === "fig") { + const figJsonObj = { type: "figure", marker: "fig", content: [] }; + node.children.slice(1, -1).forEach((child) => { + this.nodeToUSJ(child, figJsonObj); + }); + parentJsonObj.content.push(figJsonObj); + } else if (node.type === "ref") { + const refJsonObj = { type: "ref", content: [] }; + node.children.slice(1, -1).forEach((child) => { + this.nodeToUSJ(child, refJsonObj); + }); + parentJsonObj.content.push(refJsonObj); + } + } + nodeToUSJGeneric(node, parentJsonObj) { + // Build nodes for para style markers in USJ + const tagNode = node.children[0]; + + let style = this.usfm.substring(tagNode.startIndex, 
tagNode.endIndex); + if (style.startsWith("\\")) { + style = style.replace("\\", "").trim(); + } else { + style = node.type; + } + + // console.log(node.children.length, node.children[0].type, node.children[1].type) + let childrenRangeStart = 1; + if ( + node.children.length > 1 && + node.children[1].type.startsWith("numbered") + ) { + const numNode = node.children[1]; + const num = this.usfm.substring(numNode.startIndex, numNode.endIndex); + style += num; + childrenRangeStart = 2; + } + const paraJsonObj = { type: "para", marker: style, content: [] }; + parentJsonObj.content.push(paraJsonObj); + + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // Only nest these types inside the upper para style node + this.nodeToUSJ(child, paraJsonObj); + } else { + this.nodeToUSJ(child, parentJsonObj); + } + } + } + + nodeToUSJ(node, parentJsonObj) { + // Check each node and based on the type convert to corresponding XML element + switch (node.type) { + case "id": + this.nodeToUSJId(node, parentJsonObj); + break; + case "chapter": + this.nodeToUSJChapter(node, parentJsonObj); + break; + case "cl": + case "cp": + case "cd": + case "vp": + this.nodeToUSJGeneric(node, parentJsonObj); + break; + case "ca": + case "va": + this.nodeToUSJCaVa(node, parentJsonObj); + break; + case "v": + this.nodeToUSJVerse(node, parentJsonObj); + break; + case "verseText": + node.children.forEach((child) => this.nodeToUSJ(child, parentJsonObj)); + break; + case "paragraph": + case "pi": + case "ph": + this.nodeToUSJPara(node, parentJsonObj); + break; + case "text": + let textVal = this.usfm + .substring(node.startIndex, node.endIndex) + .trim(); + textVal = textVal.replace("~", " ") + if (textVal !== "") { + 
parentJsonObj.content.push(textVal); + } + break; + case "table": + case "tr": + this.nodeToUSJTable(node, parentJsonObj); + break; + case "milestone": + case "zNameSpace": + this.nodeToUSJMilestone(node, parentJsonObj); + break; + case "esb": + case "cat": + case "fig": + case "ref": + this.nodeToUSJSpecial(node, parentJsonObj); + break; + case "usfm": + break; + default: + if (NOTE_MARKERS.includes(node.type)) { + this.nodeToUSJNotes(node, parentJsonObj) + } + else if ( + CHAR_STYLE_MARKERS.includes(node.type) || + NESTED_CHAR_STYLE_MARKERS.includes(node.type) || + ["xt_standalone"].includes(node.type) + ) { + this.nodeToUSJChar(node, parentJsonObj); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + this.nodeToUSJTable(node, parentJsonObj) + }else if (node.type.endsWith("Attribute")) { + this.nodeToUSJAttrib(node, parentJsonObj); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes( + node.type.replace("\\", "").trim(), + ) + ) { + this.nodeToUSJGeneric(node, parentJsonObj); + } else if (["", "|"].includes(node.type.trim())) { + // Skip white space nodes + break; + } else if (node.children.length > 0) { + node.children.forEach((child) => + this.nodeToUSJ(child, parentJsonObj), + ); + } + // else { + // + // console.error("Encountered unknown element ", node.type); + + // } + break; + } + } +} + +export default USJGenerator; diff --git a/web-usfm-parser/src/usxGenerator.js b/web-usfm-parser/src/usxGenerator.js new file mode 100644 index 00000000..00ade8a1 --- /dev/null +++ b/web-usfm-parser/src/usxGenerator.js @@ -0,0 +1,576 @@ +//Logics for syntax-tree to xml(USX) conversions +import { DOMImplementation, XMLSerializer } from 'xmldom'; +import xpath from 'xpath'; + +import { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } from "./utils/markers.js"; + + +class USXGenerator { + /** + * A binding for all methods used in generating USX from 
Syntax tree + * @param {object} treeSitterLanguageObj - The Tree-sitter language object + * @param {Buffer} usfmString - The USFM byte data + * @param {Element} [usxRootElement] - The root element of the USX (optional) + */ + constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) { + this.usfmLanguage = treeSitterLanguageObj; + this.usfm = usfmString; + + const domImpl = new DOMImplementation(); + const doc = domImpl.createDocument(null, 'usx', null); + + if (usxRootElement === null) { + this.xmlRootNode = doc.documentElement; + this.xmlRootNode.setAttribute('version', '3.1'); + } else { + this.xmlRootNode = usxRootElement; + } + } + + /** + * Builds the ID node in USX + * @param {SyntaxNode} node - The syntax node + * @param {Element} parentXmlNode - The parent XML node to append the ID to + */ + node2UsxId(node, parentXmlNode) { + const idCaptures = this.usfmLanguage + .query("(id (bookcode) @book-code (description)? @desc)") + .captures(node); + + let code = null; + let desc = null; + + idCaptures.forEach(capture => { + if (capture.name === 'book-code') { + code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } else if (capture.name === 'desc') { + desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + } + }); + + const bookXmlNode = parentXmlNode.ownerDocument.createElement('book'); + bookXmlNode.setAttribute('code', code); + bookXmlNode.setAttribute('style', 'id'); + + if (desc && desc.trim() !== '') { + const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim()); + bookXmlNode.appendChild(textNode); + } + + parentXmlNode.appendChild(bookXmlNode); + } + + node2UsxC(node, parentXmlNode) { + // Build c, the chapter milestone node in usj + const chapCap = this.usfmLanguage + .query( + `(c (chapterNumber) @chap-num + (ca (chapterNumber) @alt-num)? 
+ (cp (text) @pub-num)?)`, + ) + .captures(node); + const chapNum = this.usfm.slice( + chapCap[0].node.startIndex, + chapCap[0].node.endIndex, + ); + const bookNode = xpath.select1("book", parentXmlNode); + const bookCode = bookNode.getAttribute("code"); + const chapRef = `${bookCode} ${chapNum}`; + + // Create the 'chapter' element + const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter'); + chapXmlNode.setAttribute("number", chapNum); + chapXmlNode.setAttribute("style", "c"); + chapXmlNode.setAttribute("sid", chapRef); + + chapCap.forEach((cap) => { + if (cap.name === "alt-num") { + const altNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('altnumber', altNum); + } + if (cap.name === "pub-num") { + const pubNum = this.usfm + .substring(cap.node.startIndex, cap.node.endIndex) + .trim(); + chapXmlNode.setAttribute('pubnumber', pubNum); + } + }); + + parentXmlNode.appendChild(chapXmlNode); + + node.children.forEach((child) => { + if (["cl", "cd"].includes(child.type)) { + this.node2Usx(child, parentXmlNode); + } + }); + } + + + + node2UsxChapter(node, parentXmlNode) { + // Build chapter node in USJ + node.children.forEach((child) => { + if (child.type === "c") { + this.node2UsxC(child, parentXmlNode); + } else { + this.node2Usx(child, parentXmlNode); + } + }); + + const prevVerses = xpath.select("//verse", this.xmlRootNode); + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 1].getAttribute('sid')); + const sibblingCount = parentXmlNode.childNodes.length; + const lastSibbling = parentXmlNode.childNodes[sibblingCount-1]; + if (lastSibbling.tagName === "para") { + lastSibbling.appendChild(vEndXmlNode); + } else if (lastSibbling.tagName === "table") { + const rows = lastSibbling.getElementsByTagName('row'); + 
rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + parentXmlNode.appendChild(vEndXmlNode); + } + } + + } + + findPrevUncle(parentXmlNode) { + // Get the grandparent node + const grandParent = parentXmlNode.parentNode; + let uncleIndex = grandParent.childNodes.length - 2; // Start from the previous sibling + + while (uncleIndex >= 0) { + const uncle = grandParent.childNodes[uncleIndex]; + + // Skip 'sidebar' and 'ms' elements + if (uncle.tagName === "sidebar" || uncle.tagName === "ms") { + uncleIndex--; + } + // Skip elements with 'ca' or 'cp' in the style attribute + else if (uncle.getAttribute('style') === 'ca' || uncle.getAttribute('style') === 'cp') { + uncleIndex--; + } + // Return the found uncle element + else { + return uncle; + } + } + return null; // No suitable uncle found + } + + node2UsxVerse(node, parentXmlNode) { + // Find all previous 'verse' elements + const prevVerses = xpath.select("//verse", this.xmlRootNode); + + // Check if there are previous verses and if the last one has a 'sid' attribute + if (prevVerses.length > 0 && prevVerses[prevVerses.length - 1].hasAttribute('sid')) { + let vEndXmlNode; + if (parentXmlNode.textContent.trim() !== "") { + // If there is verse text in the current parent + vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vEndXmlNode); + } else { + // If no text, find the previous uncle and attach the end verse + const prevUncle = this.findPrevUncle(parentXmlNode); + if (prevUncle.tagName === "para") { + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + prevUncle.appendChild(vEndXmlNode); + } else if (prevUncle.tagName === "table") { + const rows = prevUncle.getElementsByTagName('row'); + vEndXmlNode = prevUncle.ownerDocument.createElement('verse'); + rows[rows.length - 1].appendChild(vEndXmlNode); + } else { + throw new Error(`prev_uncle is ${String(prevUncle)}`); + } + } + vEndXmlNode.setAttribute('eid', prevVerses[prevVerses.length - 
1].getAttribute('sid')); + } + + // Query to capture verse-related elements + const verseNumCap = this.usfmLanguage + .query( + ` + (v + (verseNumber) @vnum + (va (verseNumber) @alt)? + (vp (text) @vp)? + )`, + ) + .captures(node); + + const verseNum = this.usfm.substring( + verseNumCap[0].node.startIndex, + verseNumCap[0].node.endIndex, + ); + const vXmlNode = parentXmlNode.ownerDocument.createElement('verse'); + parentXmlNode.appendChild(vXmlNode); + + // Loop through the captured elements and set the attributes + verseNumCap.forEach(capture => { + if (capture.name === 'alt') { + const altNum = this.usfm.slice(capture.node.startIndex, capture.node.endIndex); + vXmlNode.setAttribute('altnumber', altNum); + } else if (capture.name === 'vp') { + const vpText = this.usfm.slice(capture.node.startIndex, capture.node.endIndex).trim(); + vXmlNode.setAttribute('pubnumber', vpText); + } + }); + + // Get the last chapter's 'sid' attribute to form the verse reference + const chapterSid = xpath.select("//chapter", this.xmlRootNode).pop().getAttribute('sid'); + const ref = `${chapterSid}:${verseNum}`; + + // Set attributes on the newly created 'verse' element + vXmlNode.setAttribute('number', verseNum.trim()); + vXmlNode.setAttribute('style', 'v'); + vXmlNode.setAttribute('sid', ref.trim()); + } + + node2UsxCaVa(node, parentXmlNode) { + // Build elements for independent ca and va away from c and v + const style = node.type; + + // Create a new 'char' element under the parent XML node + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + charXmlNode.setAttribute('style', style); + + // Query to capture chapterNumber or verseNumber + const altNumMatch = this.usfmLanguage + .query( + `([ + (chapterNumber) + (verseNumber) + ] @alt-num)`, + ) + .captures(node); + + // Extract the alternate number from the captured range + const altNum = this.usfm + .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex) + .trim(); + + // Set the attributes on the 
'char' element + charXmlNode.setAttribute('altnumber', altNum); + charXmlNode.setAttribute('closed', 'true'); + + // Append the 'char' element to the parent XML node + parentXmlNode.appendChild(charXmlNode); + } + + node2UsxPara(node, parentXmlNode) { + // Build paragraph nodes in USX + if (node.children[0].type.endsWith('Block')) { + for (const child of node.children[0].children) { + this.node2UsxPara(child, parentXmlNode); + } + } else if (node.type === 'paragraph') { + const paraTagCap = this.usfmLanguage + .query("(paragraph (_) @para-marker)") + .captures(node)[0]; + const paraMarker = paraTagCap.node.type; + + if (!paraMarker.endsWith("Block")) { + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of paraTagCap.node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } else if (['pi', 'ph'].includes(node.type)) { + const paraMarker = this.usfm.slice(node.children[0].startIndex, node.children[0].endIndex) + .replace("\\", "") + .trim(); + const paraXmlNode = parentXmlNode.ownerDocument.createElement("para"); + paraXmlNode.setAttribute("style", paraMarker); + parentXmlNode.appendChild(paraXmlNode); + + for (const child of node.children.slice(1)) { + this.node2Usx(child, paraXmlNode); + } + + } + } + + + node2UsxNotes(node, parentXmlNode) { + // Build USJ nodes for footnotes and cross-references + const tagNode = node.children[0]; + const callerNode = node.children[1]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const noteXmlNode = parentXmlNode.ownerDocument.createElement('note'); + noteXmlNode.setAttribute('style', style); + const caller = this.usfm + .substring(callerNode.startIndex, callerNode.endIndex) + .trim(); + noteXmlNode.setAttribute('caller', caller); + parentXmlNode.appendChild(noteXmlNode); + for (let i = 2; i < node.children.length - 
1; i++) { + this.node2Usx(node.children[i], noteXmlNode); + } + + } + + node2UsxChar(node, parentXmlNode) { + // Build USJ nodes for character markups, both regular and nested + const tagNode = node.children[0]; + let childrenRange = node.children.length; + if (node.children[node.children.length - 1].type.startsWith("\\")) { + childrenRange -= 1; // Exclude the last node if it starts with '\', treating it as a closing node + } + const charXmlNode = parentXmlNode.ownerDocument.createElement('char'); + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .replace("+", "") + .trim(); + charXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(charXmlNode); + + for (let i = 1; i < childrenRange; i++) { + this.node2Usx(node.children[i], charXmlNode); + } + + } + + node2UsxAttrib(node, parentXmlNode) { + // Add attribute values to USJ elements + const attribNameNode = node.children[0]; + let attribName = this.usfm + .slice(attribNameNode.startIndex, attribNameNode.endIndex) + .trim(); + + // Handling special cases for attribute names + if (attribName === "|") { + attribName = DEFAULT_ATTRIB_MAP[node.parent.type]; + } + if (attribName === "src") { + // for \fig + attribName = "file"; + } + + const attribValCap = this.usfmLanguage + .query("((attributeValue) @attrib-val)") + .captures(node); + + let attribValue = ""; + if (attribValCap.length > 0) { + attribValue = this.usfm + .substring( + attribValCap[0].node.startIndex, + attribValCap[0].node.endIndex, + ) + .trim(); + } + + parentXmlNode.setAttribute(attribName, attribValue); + } + + node2UsxTable(node, parentXmlNode) { + // Handle table related components and convert to USJ + if (node.type === "table") { + const tableXmlNode = parentXmlNode.ownerDocument.createElement('table'); + parentXmlNode.appendChild(tableXmlNode); + node.children.forEach((child) => { + this.node2Usx(child, tableXmlNode); + }); + } else if (node.type === "tr") { + const rowXmlNode = 
parentXmlNode.ownerDocument.createElement('row'); + rowXmlNode.setAttribute("style", "tr"); + parentXmlNode.appendChild(rowXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, rowXmlNode); + }); + } else if (TABLE_CELL_MARKERS.includes(node.type)) { + const tagNode = node.children[0]; + const style = this.usfm + .substring(tagNode.startIndex, tagNode.endIndex) + .replace("\\", "") + .trim(); + const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell"); + cellXmlNode.setAttribute("style", style); + cellXmlNode.setAttribute("align", style.includes("r") ? "end" : "start"); + parentXmlNode.appendChild(cellXmlNode); + node.children.slice(1).forEach((child) => { + this.node2Usx(child, cellXmlNode); + }); + } + } + + node2UsxMilestone(node, parentXmlNode) { + // Create ms node in USJ + + const msNameCap = this.usfmLanguage + .query( + `( + [(milestoneTag) + (milestoneStartTag) + (milestoneEndTag) + (zSpaceTag) + ] @ms-name)`, + ) + .captures(node)[0]; + + const style = this.usfm + .slice(msNameCap.node.startIndex, msNameCap.node.endIndex) + .replace("\\", "") + .trim(); + const msXmlNode = parentXmlNode.ownerDocument.createElement("ms"); + msXmlNode.setAttribute("style", style); + parentXmlNode.appendChild(msXmlNode); + node.children.forEach((child) => { + if (child.type.endsWith("Attribute")) { + this.node2Usx(child, msXmlNode); + } + }); + } + + node2UsxSpecial(node, parentXmlNode) { + // Build nodes for esb, cat, fig, optbreak in USJ + + if (node.type === "esb") { + const sidebarXmlNode = parentXmlNode.ownerDocument.createElement('sidebar'); + sidebarXmlNode.setAttribute('style', "esb"); + parentXmlNode.appendChild(sidebarXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, sidebarXmlNode); + }); + } else if (node.type === "cat") { + const catCap = this.usfmLanguage + .query("((category) @category)") + .captures(node)[0]; + const category = this.usfm + .substring(catCap.node.startIndex, 
catCap.node.endIndex) + .trim(); + parentXmlNode.setAttribute("category", category); + } else if (node.type === "fig") { + const figXmlNode = parentXmlNode.ownerDocument.createElement('figure'); + figXmlNode.setAttribute("style", "fig"); + parentXmlNode.appendChild(figXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, figXmlNode); + }); + } else if (node.type === "ref") { + const refXmlNode = parentXmlNode.ownerDocument.createElement('ref'); + parentXmlNode.appendChild(refXmlNode); + node.children.slice(1, -1).forEach((child) => { + this.node2Usx(child, refXmlNode); + }); + } + } + + node2UsxGeneric(node, parentXmlNode) { + const tagNode = node.children[0]; + let style = this.usfm.slice(tagNode.startIndex, tagNode.endIndex).trim(); + + // Strip leading backslashes from the style or use node type + if (style.startsWith('\\')) { + style = style.replace('\\', ''); + // } else { + // style = node.type; + } + + if (style === "usfm") { + return + } + + let childrenRangeStart = 1; + + // Create a 'para' element and set its style attribute + const paraXmlNode = parentXmlNode.ownerDocument.createElement('para'); + paraXmlNode.setAttribute('style', style); + parentXmlNode.appendChild(paraXmlNode); + + // Loop through the child nodes and recursively process them + for (let i = childrenRangeStart; i < node.children.length; i++) { + const child = node.children[i]; + if ( + CHAR_STYLE_MARKERS.includes(child.type) || + NESTED_CHAR_STYLE_MARKERS.includes(child.type) || + [ + "text", + "footnote", + "crossref", + "verseText", + "v", + "b", + "milestone", + "zNameSpace", + ].includes(child.type) + ) { + // If the child is of one of the allowed types, nest it inside the para node + this.node2Usx(child, paraXmlNode); + } else { + // Otherwise, append the child to the parent XML node + this.node2Usx(child, parentXmlNode); + } + } + + // Append the created para node to the parent XML node + } + + node2Usx(node, parentXmlNode) { + // Handling node types 
with respective functions + if (node.type === "id") { + this.node2UsxId(node, parentXmlNode); + } else if (node.type === "chapter") { + this.node2UsxChapter(node, parentXmlNode); + } else if (["cl", "cp", "cd", "vp"].includes(node.type)) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["ca", "va"].includes(node.type)) { + this.node2UsxCaVa(node, parentXmlNode); + } else if (node.type === "v") { + this.node2UsxVerse(node, parentXmlNode); + } else if (node.type === "verseText") { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + }); + } else if (["paragraph", "pi", "ph"].includes(node.type)) { + this.node2UsxPara(node, parentXmlNode); + } else if (NOTE_MARKERS.includes(node.type)) { + this.node2UsxNotes(node, parentXmlNode); + } else if ( + CHAR_STYLE_MARKERS.concat(NESTED_CHAR_STYLE_MARKERS, ["xt_standalone"]).includes(node.type) + ) { + this.node2UsxChar(node, parentXmlNode); + } else if (node.type.endsWith("Attribute")) { + this.node2UsxAttrib(node, parentXmlNode); + } else if (node.type === "text") { + let textVal = this.usfm.slice(node.startIndex, node.endIndex).trim(); + textVal = textVal.replace("~", " ") + const textNode = parentXmlNode.ownerDocument.createTextNode(textVal); + parentXmlNode.appendChild(textNode); + } else if (["table", "tr"].concat(TABLE_CELL_MARKERS).includes(node.type)) { + this.node2UsxTable(node, parentXmlNode); + } else if (node.type === "milestone" || node.type === "zNameSpace") { + this.node2UsxMilestone(node, parentXmlNode); + } else if (["esb", "cat", "fig", "ref"].includes(node.type)) { + this.node2UsxSpecial(node, parentXmlNode); + } else if ( + PARA_STYLE_MARKERS.includes(node.type) || + PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim()) + ) { + this.node2UsxGeneric(node, parentXmlNode); + } else if (["", "|"].includes(node.type.trim())) { + // Skip whitespace nodes + } else if (node.children.length > 0) { + node.children.forEach(child => { + this.node2Usx(child, parentXmlNode); + 
}); + } + // else { + // throw new Error(`Encountered unknown element: ${node}`); + // } + } +} + + +export default USXGenerator; diff --git a/web-usfm-parser/src/utils/filter.js b/web-usfm-parser/src/utils/filter.js new file mode 100644 index 00000000..5cd904bd --- /dev/null +++ b/web-usfm-parser/src/utils/filter.js @@ -0,0 +1,147 @@ +// src/Filter.js + +const Filter = { + BOOK_HEADERS: [ + "ide", + "usfm", + "h", + "toc", + "toca", + "imt", + "is", + "ip", + "ipi", + "im", + "imi", + "ipq", + "imq", + "ipr", + "iq", + "ib", + "ili", + "iot", + "io", + "iex", + "imte", + "ie", + ], + TITLES: [ + "mt", + "mte", + "cl", + "cd", + "ms", + "mr", + "s", + "sr", + "r", + "d", + "sp", + "sd", + ], + COMMENTS: ["sts", "rem", "lit", "restore"], + PARAGRAPHS: [ + "p", + "m", + "po", + "pr", + "cls", + "pmo", + "pm", + "pmc", + "pmr", + "pi", + "mi", + "nb", + "pc", + "ph", + "q", + "qr", + "qc", + "qa", + "qm", + "qd", + "lh", + "li", + "lf", + "lim", + "litl", + "tr", + "tc", + "th", + "tcr", + "thr", + "table", + "b", + ], + CHARACTERS: [ + "add", + "bk", + "dc", + "ior", + "iqt", + "k", + "litl", + "nd", + "ord", + "pn", + "png", + "qac", + "qs", + "qt", + "rq", + "sig", + "sls", + "tl", + "wj", + "em", + "bd", + "bdit", + "it", + "no", + "sc", + "sup", + "rb", + "pro", + "w", + "wh", + "wa", + "wg", + "lik", + "liv", + "jmp", + ], + NOTES: [ + "f", + "fe", + "ef", + "efe", + "x", + "ex", + "fr", + "ft", + "fk", + "fq", + "fqa", + "fl", + "fw", + "fp", + "fv", + "fdc", + "xo", + "xop", + "xt", + "xta", + "xk", + "xq", + "xot", + "xnt", + "xdc", + ], + STUDY_BIBLE: ["esb", "cat"], + BCV: ["id", "c", "v"], + TEXT: ["text-in-excluded-parent"], + // INNER_CONTENT: ["content-in-excluded-parent"] +}; + +export default Filter; diff --git a/web-usfm-parser/src/utils/format.js b/web-usfm-parser/src/utils/format.js new file mode 100644 index 00000000..6d45ad57 --- /dev/null +++ b/web-usfm-parser/src/utils/format.js @@ -0,0 +1,12 @@ +// src/Format.js + +const Format = { + JSON: 
// Marker-name tables shared by the converters. Every export is a plain
// array of marker strings, except DEFAULT_ATTRIB_MAP which is an object.

/** Paragraph-style markers, grouped by the categories noted inline. */
export const PARA_STYLE_MARKERS = [
  // identification
  "ide", "usfm", "h", "toc", "toca",
  // intro
  "imt", "is", "ip", "ipi", "im", "imi", "ipq", "imq", "ipr",
  "iq", "ib", "ili", "iot", "io", "iex", "imte", "ie",
  // titles
  "mt", "mte", "cl", "cd", "ms", "mr", "s", "sr", "r", "d", "sp", "sd",
  // poetry
  "q", "qr", "qc", "qa", "qm", "qd",
  // lists
  "lh", "li", "lf", "lim", "litl",
  // comments
  "sts", "rem", "lit", "restore",
];

/** Markers that open a footnote / cross-reference style note. */
export const NOTE_MARKERS = ["f", "fe", "ef", "efe", "x", "ex"];

/** Character-style markers, including the content markers used inside notes. */
export const CHAR_STYLE_MARKERS = [
  // special - text
  "add", "bk", "dc", "ior", "iqt", "k", "litl", "nd", "ord", "pn",
  "png", "qac", "qs", "qt", "rq", "sig", "sls", "tl", "wj",
  // character styling
  "em", "bd", "bdit", "it", "no", "sc", "sup",
  // special - features
  "rb", "pro", "w", "wh", "wa", "wg",
  // structured list entries
  "lik", "liv",
  "jmp",
  // footnote - content
  "fr", "ft", "fk", "fq", "fqa", "fl", "fw", "fp", "fv", "fdc",
  // crossref - content
  "xo", "xop", "xt", "xta", "xk", "xq", "xot", "xnt", "xdc",
];

/** Every character-style marker with the suffix "Nested" appended. */
export const NESTED_CHAR_STYLE_MARKERS = CHAR_STYLE_MARKERS.map(
  (marker) => marker + "Nested",
);

/** Maps a marker / node type name to an attribute name. */
export const DEFAULT_ATTRIB_MAP = {
  w: "lemma",
  rb: "gloss",
  xt: "href",
  fig: "alt",
  xt_standalone: "href",
  xtNested: "href",
  ref: "loc",
  milestone: "who",
  k: "key",
};

export const TABLE_CELL_MARKERS = ["tc", "th", "tcr", "thr"];
export const MISC_MARKERS = ["fig", "cat", "esb", "b", "ph", "pi"];
// String tables naming USJ / USX node types and keys.
// NOTE(review): the constant names suggest how the USFM generators use them
// (which types emit no marker, which need a closing marker, which keys are
// not attributes, which types take no preceding newline) — confirm against
// the generator code, which is not part of this file.

export const NO_USFM_USJ_TYPES = ["USJ", "table"];
export const CLOSING_USJ_TYPES = ["char", "note", "figure"];

// Keys shared by the USJ and USX "non-attribute" lists.
const SHARED_NON_ATTRIB_KEYS = [
  "number", "sid", "code", "caller", "align",
  "version", "altnumber", "pubnumber", "category",
];

export const NON_ATTRIB_USJ_KEYS = [
  "type", "marker", "content",
  ...SHARED_NON_ATTRIB_KEYS,
];

export const NON_ATTRIB_USX_KEYS = ["style", ...SHARED_NON_ATTRIB_KEYS];

export const NO_NEWLINE_USJ_TYPES = ["char", "note", "verse", "table:cell"];
export const NO_NEWLINE_USX_TYPES = ["char", "note", "verse", "cell"];
// Minimal fixtures shared by the sanity and constructor tests below.
const simpleUSFM = '\\id GEN\n\\c 1\n\\p\n\\v 1 In the begining..\\v 2';
const simpleUSJ = {
  type: 'USJ',
  version: '0.3.0',
  content: [
    { type: 'book', marker: 'id', code: 'GEN', content: [] },
    { type: 'chapter', marker: 'c', number: '1', sid: 'GEN 1' },
    {
      type: 'para',
      marker: 'p',
      content: [
        { type: 'verse', marker: 'v', number: 1 },
        "In the begining..",
        { type: 'verse', marker: 'v', number: 2 },
      ],
    },
  ],
};

describe("Sanity Check for the testing pipeline", () => {
  it("Parse, toUSJ and back toUSFM", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    const usfmParser = new USFMParser(simpleUSFM);
    const output = usfmParser.toUSJ();
    assert.notStrictEqual(output, null, 'The result should not be null and no errors during conversion');

    const usfm = usfmParser.usjToUsfm(output);
    assert.notStrictEqual(usfm, null, 'The result should not be null and no errors during conversion');
  });
});

describe("USFMParser Object initialization", () => {
  it("with USFM", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    const usfmParser = new USFMParser(simpleUSFM);
    assert.strictEqual(usfmParser.usfm, simpleUSFM);
  });

  it("with USJ", async () => {
    const usfmParser = new USFMParser(null, simpleUSJ);
    assert.strictEqual(usfmParser.usj, simpleUSJ);
  });

  // The negative cases use assert.throws so that a constructor which does
  // NOT throw fails the test. The earlier try/catch form declared a second,
  // block-scoped `usfmParser` inside `try`, so its trailing
  // `assert.strictEqual(usfmParser, null)` checked the untouched outer
  // variable and could never fail.

  it("with nothing", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    assert.throws(
      () => new USFMParser(),
      { message: "Missing input! Either USFM, USJ or USX is to be provided." },
    );
  });

  it("with usfm and usj", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    assert.throws(
      () => new USFMParser(simpleUSFM, simpleUSJ),
      {
        message:
          "Found more than one input!\nOnly one of USFM, USJ or USX is supported in one object.",
      },
    );
  });

  it("with usj in place of USFM", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    assert.throws(
      () => new USFMParser(simpleUSJ),
      { message: "Invalid input for USFM. Expected a string." },
    );
  });

  it("with usfm in place of USJ", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    // Pass the USFM *string* in the USJ slot; a USJ object there is valid
    // input and would not exercise the type check this test is named for.
    assert.throws(
      () => new USFMParser(null, simpleUSFM),
      { message: "Invalid input for USJ. Expected an object." },
    );
  });

  it("with usj as default", async () => {
    await USFMParser.init("./tree-sitter-usfm.wasm", "./tree-sitter.wasm");
    assert.throws(
      () => new USFMParser(simpleUSJ),
      { message: "Invalid input for USFM. Expected a string." },
    );
  });
});
of columns in table not validated by usfm-grammar + "/paratextTests/MissingColumnInTable/origin.usfm": "pass", + + // WordlistMarkerMissingFromGlossaryCitationForms from paratext. Something to do with \k or \w + "/paratextTests/WordlistMarkerMissingFromGlossaryCitationForms/origin.usfm": "pass", + + "/usfmjsTests/ts/origin.usfm": "pass", // Committee thinks these should fail though + "/usfmjsTests/chunk_footnote/origin.usfm": "pass", // Committee thinks these should fail though + "/usfmjsTests/ts_2/origin.usfm": "pass", // Committee thinks these should fail though + "/special-cases/newline-attributes/origin.usfm": "pass", // Committee thinks these should fail though + "/special-cases/empty-attributes5/origin.usfm": "pass", // Committee thinks these should fail though + + // no content in ide, rem, toc1, ip etc + "/paratextTests/NoErrorsPartiallyEmptyBook/origin.usfm": "fail", + "/paratextTests/NoErrorsEmptyBook/origin.usfm": "fail", + "/usfmjsTests/57-TIT.greek/origin.usfm": "fail", + "/paratextTests/EmptyMarkers/origin.usfm": "fail", + + // no \p (usually after \s) + "/usfmjsTests/missing_verses/origin.usfm": "fail", // has \s5 + "/usfmjsTests/isa_verse_span/origin.usfm": "fail", // has \s5 + "/usfmjsTests/isa_footnote/origin.usfm": "fail", // has \s5 + "/usfmjsTests/tit_extra_space_after_chapter/origin.usfm": "fail", // has \s5 + "/usfmjsTests/1ch_verse_span/origin.usfm": "fail", // has \s5 + "/usfmjsTests/usfmIntroTest/origin.usfm": "fail", + "/usfmjsTests/out_of_sequence_verses/origin.usfm": "fail", + "/usfmjsTests/acts_1_milestone/origin.usfm": "fail", + "/usfmjsTests/luk_quotes/origin.usfm": "fail", + "/biblica/BlankLinesWithFigures/origin.usfm": "fail", //\fig used without \p, only \b + + //no space after \s5 + "/usfmjsTests/usfmBodyTestD/origin.usfm": "fail", + "/usfmjsTests/usfm-body-testF/origin.usfm": "fail", + "/usfmjsTests/psa_quotes/origin.usfm": "fail", + "/usfmjsTests/pro_footnote/origin.usfm": "fail", + "/usfmjsTests/pro_quotes/origin.usfm": 
"fail", + "/samples-from-wild/doo43-1/origin.usfm": "fail", + "/usfmjsTests/gn_headers/origin.usfm": "fail", + "/usfmjsTests/isa_inline_quotes/origin.usfm": "fail", + "/usfmjsTests/job_footnote/origin.usfm": "fail", + "/usfmjsTests/mat-4-6.whitespace/origin.usfm": "fail", + "/usfmjsTests/out_of_sequence_chapters/origin.usfm": "fail", + + "/biblica/PublishingVersesWithFormatting/origin.usfm": "fail", // \c without number + + "/special-cases/figure_with_quotes_in_desc/origin.usfm": "fail", // quote within quote + "/specExamples/poetry/origin.usfm": "fail", // \b not followed by a \p or \q + + "/paratextTests/InvalidRubyMarkup/origin.usfm": "fail", // contradicts /paratextTests/MissingRequiredAttributesReported + "/special-cases/empty-book/origin.usfm": "pass", // Just says only \id is not enough. Not clear what else is mandatory + "/usfmjsTests/f10_gen12-2_empty_word/origin.usfm": "pass", // Empty \w \w* is accepted by us as of now + //########## Need to be fixed ####################### + "/paratextTests/NoErrorsShort/origin.usfm": "pass", // \c is mandatory! + // "/usfmjsTests/gn_headers/origin.usfm": "fail", # what is the valid position for mte and imt + "/usfmjsTests/acts_8-37-ugnt-footnote/origin.usfm": "fail", // no clue why it fails + + "/advanced/periph/origin.usfm": "fail", // Peripharals not implemented + "/advanced/nesting1/origin.usfm": "fail", // We dont support char within char w/o +, yet + "/samples-from-wild/doo43-4/origin.usfm": "fail", // ior surronded by a () leaves a stray ) at the end. + +}; + + +let excludeUSJs = [ + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.json`, //ref object introduced which is not in usfm + `${TEST_DIR}/special-cases/empty-attributes/origin.json`, //lemma not given correctly. Issue from USX + `${TEST_DIR}/specExamples/character/origin.json`,// lit element treated as a body paragraph enclosing a verse! 
Issue from USX + + ] + +let excludeUSXs = [ + `${TEST_DIR}/specExamples/extended/contentCatogories2/origin.xml`, + // \ef not treated as inline content of paragraph + `${TEST_DIR}/specExamples/extended/sectionIntroductions/origin.xml`, + // verse number="+"!!! + `${TEST_DIR}/specExamples/character/origin.xml`, + // lit element treated as a body paragraph enclosing a verse! + `${TEST_DIR}/usfmjsTests/esb/origin.xml`, + // last verse text given outside of paragraph. + `${TEST_DIR}/special-cases/nbsp/origin.xml`, + // ~ not being replaced by nbsp in usfm-grammar + `${TEST_DIR}/special-cases/empty-attributes/origin.xml`, + // attributes treated as text content of marker + `${TEST_DIR}/biblica/CategoriesOnNotes/origin.xml`, + `${TEST_DIR}/biblica/CrossRefWithPipe/origin.xml`, + // ref node has type ref. Is it char or ref? + `${TEST_DIR}/usfmjsTests/usfmBodyTestD/origin.xml`, + // \v and other contents contained inside \lit. New docs doesnt have \lit + `${TEST_DIR}/usfmjsTests/usfm-body-testF/origin.xml`, + // does the ms go inside \s5 or after it? 
/**
 * Read a USFM file from disk and construct a parser over its contents.
 *
 * Kept `async` because callers `await` it; errors from reading the file or
 * from the USFMParser constructor propagate unchanged.
 *
 * @param {string} inputUsfmPath - path of the USFM file to load
 * @returns {Promise<USFMParser>}
 */
async function initialiseParser(inputUsfmPath) {
  const data = fs.readFileSync(inputUsfmPath, 'utf8');
  return new USFMParser(data);
}

/**
 * Decide whether a sample USFM file is expected to parse without errors.
 *
 * An entry in `passFailOverrideList` (keyed by the path with `TEST_DIR`
 * removed) wins; otherwise the `<validated>` value of the sibling
 * `metadata.xml` is used.
 *
 * @param {string} inputUsfmPath - path ending in `origin.usfm`
 * @returns {boolean} true for "pass", false for "fail"
 * @throws {Error} when neither source yields "pass" or "fail"
 */
function checkValidUsfm(inputUsfmPath) {
  const override = passFailOverrideList[inputUsfmPath.replace(TEST_DIR, '')];
  if (override === "pass") {
    return true;
  }
  if (override === "fail") {
    return false;
  }

  const metaFilePath = inputUsfmPath.replace("origin.usfm", "metadata.xml");
  const metadata = fs.readFileSync(metaFilePath, 'utf8');

  // NOTE(review): `value` is read right after this call, so this relies on
  // xml2js invoking the callback synchronously — confirm against the xml2js
  // options in use.
  let value = null;
  xml2js.parseString(metadata, (err, result) => {
    if (err) {
      console.error('Error parsing XML:', err);
      return;
    }
    value = result['test-metadata']['validated'][0];
  });

  if (value === "fail") {
    return false;
  }
  if (value === "pass") {
    return true;
  }
  throw Error(`Validation read as : ${value} for ${metaFilePath}`);
}

/**
 * List the distinct markers used in a USFM string, in order of first
 * appearance.
 *
 * A marker is a backslash, an optional `+`, letters, optional digits and an
 * optional `-s` / `-e` suffix. `usfm` is always dropped; `esbe` is dropped
 * too, but only after checking that `esb` is also present.
 *
 * @param {string} usfmStr - USFM text to scan
 * @param {boolean} [keepId=false] - keep the `id` marker in the result
 * @param {boolean} [keepNumber=true] - keep trailing digits (`q1`); when
 *   false, `\q1` is reported as `q` and `\qt1-s` as `qt-s`
 * @returns {string[]} unique marker names
 * @throws {Error} if `esbe` occurs without `esb`
 */
function findAllMarkers(usfmStr, keepId = false, keepNumber = true) {
  // Groups: 1 = full marker, 2 = letters only, 3 = optional -s / -e suffix.
  const markerPattern = /\\\+?(([A-Za-z]+)\d*(-[se])?)/g;

  const markers = new Set();
  for (const match of usfmStr.matchAll(markerPattern)) {
    // Group 3 is undefined when there is no milestone suffix.
    markers.add(keepNumber ? match[1] : match[2] + (match[3] ?? ''));
  }

  if (!keepId) {
    markers.delete('id');
  }

  if (markers.has('esbe')) {
    if (!markers.has('esb')) {
      throw new Error("'esb' must be present if 'esbe' is found");
    }
    markers.delete('esbe');
  }

  markers.delete('usfm');

  // A Set iterates in insertion order, i.e. order of first appearance.
  return [...markers];
}
describe("Test include Marker option in List conversion", () => {
  // Test include Maker option by checking markers in the List
  const includeTests = [
    // Spread the arrays together: using `+` on arrays joins them into a
    // single comma-separated string, which turned the membership check
    // below into a substring match.
    ['id', 'c', 'v', ...Filter.TEXT, ...Filter.PARAGRAPHS],
  ];

  includeTests.forEach((inList) => {
    allUsfmFiles.forEach((value) => {
      if (!isValidUsfm[value]) {
        return;
      }
      it(`include ${inList.slice(0, 5)} of ${value} in List`, async () => {
        const testParser = await initialiseParser(value);
        assert(testParser instanceof USFMParser);
        const list = testParser.toList(null, inList);
        assert(list instanceof Array);

        // Row 0 is the header row; column 5 holds the marker.
        const allTypes = list.slice(1).map((row) => row[5]);
        assert(
          allTypes.every((element) => inList.includes(element)),
          `Markers outside the include list found: ${allTypes}`,
        );
      });
    });
  });
});
describe("Check successful USFM-USJ conversion for positive samples", () => {
  // Only samples expected to parse cleanly are converted.
  allUsfmFiles.forEach((value) => {
    if (!isValidUsfm[value]) {
      return;
    }
    it(`Convert ${value} to USJ`, async () => {
      const testParser = await initialiseParser(value);
      const usj = testParser.toUSJ();
      // Check the conversion result itself (the parser object was being
      // tested here before, which says nothing about the USJ output).
      assert(usj instanceof Object);
      assert.strictEqual(usj["type"], "USJ");
      assert.strictEqual(usj["version"], "3.1");
      assert.strictEqual(usj.content[0].type, "book");
      assert.strictEqual(usj.content[0].marker, "id");
    });
  });
});
describe("Test USFM-USJ-USFM roundtripping", () => {
  allUsfmFiles
    .filter((usfmPath) => isValidUsfm[usfmPath])
    .forEach((usfmPath) => {
      it(`Roundtrip ${usfmPath} via USJ`, async () => {
        const sourceParser = await initialiseParser(usfmPath);
        assert(sourceParser instanceof USFMParser);
        const usj = sourceParser.toUSJ();
        assert(usj instanceof Object);

        // Build a second parser from the USJ and read back its USFM text.
        const roundtripParser = new USFMParser(null, usj);
        const regenerated = roundtripParser.usfm;
        assert.strictEqual(typeof regenerated, 'string');
        assert(regenerated.startsWith("\\id"));

        const before = findAllMarkers(sourceParser.usfm);
        const after = findAllMarkers(regenerated);
        assert.deepStrictEqual(before, after, `Markers in input and generated USFMs differ`);
      });
    });
});


describe("Ensure all markers are in USJ", () => {
  // Tests if all markers in USFM are present in output also
  allUsfmFiles
    .filter((usfmPath) => isValidUsfm[usfmPath])
    .forEach((usfmPath) => {
      it(`Check for markers of ${usfmPath} in USJ`, async () => {
        const parser = await initialiseParser(usfmPath);
        assert(parser instanceof USFMParser);
        const usj = parser.toUSJ();
        assert(usj instanceof Object);

        const markersInUsfm = [...new Set(findAllMarkers(parser.usfm, true))];
        const typesInUsj = getTypes(usj);

        assert.deepStrictEqual(markersInUsfm, typesInUsj, `Markers in input and generated USJ differ`);
      });
    });
});

describe("Validate USJ against schema", () => {
  // Test generated USJ against USJ schema
  const schemaText = fs.readFileSync("../schemas/usj.js", 'utf8');
  const validate = new Ajv().compile(JSON.parse(schemaText));

  allUsfmFiles
    .filter((usfmPath) => isValidUsfm[usfmPath])
    .forEach((usfmPath) => {
      it(`Validate USJ generated from ${usfmPath}`, async () => {
        const parser = await initialiseParser(usfmPath);
        assert(parser instanceof USFMParser);
        const usj = parser.toUSJ();
        assert(usj instanceof Object);

        assert(validate(usj));
      });
    });
});

describe("Test Exclude Marker option", () => {
  // Test Exclude Maker option by checking markers in the USJ
  const excludeTests = [
    ['v', 'c'],
    Filter.PARAGRAPHS,
    [...Filter.TITLES, ...Filter.BOOK_HEADERS],
  ];

  for (const exList of excludeTests) {
    for (const usfmPath of allUsfmFiles) {
      if (!isValidUsfm[usfmPath]) {
        continue;
      }
      it(`Exclude ${exList.slice(0, 5)} from ${usfmPath}`, async () => {
        const parser = await initialiseParser(usfmPath);
        assert(parser instanceof USFMParser);
        const usj = parser.toUSJ(exList);
        assert(usj instanceof Object);

        // None of the excluded markers may survive in the output.
        const remaining = new Set(getTypes(usj));
        const leaked = exList.filter((marker) => remaining.has(marker));
        assert.deepStrictEqual(leaked, []);
      });
    }
  }
});
/**
 * Trim every string found in `content` arrays, recursively and in place.
 *
 * Trailing and preceding space handling can differ between the testsuite
 * samples and this library, so both sides are stripped before comparison.
 * Values without a `content` array (including null and primitives) are
 * left untouched.
 *
 * @param {object} usjObj - USJ node to normalise in place
 */
function stripTextValue(usjObj) {
  if (!Array.isArray(usjObj?.content)) {
    return;
  }
  usjObj.content.forEach((item, index) => {
    if (typeof item === 'string') {
      usjObj.content[index] = item.trim();
    } else {
      stripTextValue(item);
    }
  });
}

/**
 * Collapse every run of whitespace (newlines included) in `content`
 * strings to a single space, recursively and in place.
 *
 * The testsuite samples do not preserve newlines while this library does,
 * so they are removed just for comparison.
 *
 * @param {object} usjDict - USJ node to normalise in place
 */
function removeNewlinesInText(usjDict) {
  if (!Array.isArray(usjDict?.content)) {
    return;
  }
  usjDict.content.forEach((item, index) => {
    if (typeof item === 'string') {
      // \s matches "\n" too, so one pass replaces newlines and squeezes spaces.
      usjDict.content[index] = item.replace(/\s+/g, " ");
    } else {
      removeNewlinesInText(item);
    }
  });
}

/**
 * Trim the `lemma` value of every `char` / `w` node, recursively and in
 * place. The testsuite samples carry stray spaces in lemma values that were
 * given as a default attribute.
 *
 * @param {object} usjDict - USJ node to normalise in place
 */
function stripDefaultAttribValue(usjDict) {
  if (!Array.isArray(usjDict?.content)) {
    return;
  }
  for (const item of usjDict.content) {
    // typeof null is 'object', so exclude it explicitly.
    if (item === null || typeof item !== 'object' || Array.isArray(item)) {
      continue;
    }
    if (item.type === "char" && item.marker === "w" && typeof item.lemma === 'string') {
      item.lemma = item.lemma.trim();
    }
    stripDefaultAttribValue(item);
  }
}

/**
 * Collect the distinct marker names represented in a USJ tree, in order of
 * first appearance (depth-first).
 *
 * Besides each node's `marker`, some keys stand for markers of their own:
 * `type: 'ref'` -> `ref`, `altnumber` -> `ca` / `va`, `pubnumber` ->
 * `cp` / `vp` (chapter variants when the node's marker is `c`), and
 * `category` -> `cat`.
 *
 * @param {object|string} element - USJ node; strings, null and other
 *   primitives contribute nothing
 * @param {boolean} [keepNumber=true] - when false, trailing digits are
 *   removed (`q1` -> `q`) and the result is de-duplicated again
 * @returns {string[]} unique marker names
 */
function getTypes(element, keepNumber = true) {
  const seen = new Set();

  const visit = (node) => {
    if (node === null || typeof node !== 'object') {
      return;
    }
    if ('marker' in node) {
      seen.add(node.marker);
    }
    if (node.type === 'ref') {
      seen.add("ref");
    }
    if ('altnumber' in node) {
      seen.add(node.marker === 'c' ? 'ca' : 'va');
    }
    if ('pubnumber' in node) {
      seen.add(node.marker === 'c' ? 'cp' : 'vp');
    }
    if ('category' in node) {
      seen.add('cat');
    }
    if (Array.isArray(node.content)) {
      node.content.forEach((child) => visit(child));
    }
  };
  visit(element);

  if (keepNumber) {
    return [...seen];
  }
  // Stripping digits can make q1 and q2 collide, so de-duplicate afterwards.
  return [...new Set([...seen].map((item) => item.replace(/\d+$/, '')))];
}
describe("Ensure all markers are in USX", () => {
  // Tests if all markers in USFM are present in output also
  allUsfmFiles.forEach((value) => {
    if (!isValidUsfm[value]) {
      return;
    }
    it(`Check for markers of ${value} in USX`, async () => {
      const testParser = await initialiseParser(value);
      assert(testParser instanceof USFMParser);
      const usx = testParser.toUSX();

      // findAllMarkers already returns unique markers.
      const inputMarkers = findAllMarkers(testParser.usfm, true);
      const allUSXNodes = getNodes(usx);

      assert.deepStrictEqual(inputMarkers, allUSXNodes, `Markers in input and generated USX differ`);
    });
  });
});

describe("Test USFM-USX-USFM roundtripping", () => {
  allUsfmFiles.forEach((value) => {
    if (!isValidUsfm[value]) {
      return;
    }
    it(`Roundtrip ${value} via USX`, async () => {
      const testParser = await initialiseParser(value);
      assert(testParser instanceof USFMParser);
      const usx = testParser.toUSX();
      // nodeType 1 is an element node.
      assert.strictEqual(usx.nodeType, 1);

      // Build a second parser from the USX and read back its USFM text.
      const testParser2 = new USFMParser(null, null, usx);
      const generatedUSFM = testParser2.usfm.trim();
      assert.strictEqual(typeof generatedUSFM, 'string');
      assert(generatedUSFM.startsWith("\\id"));

      const inputMarkers = findAllMarkers(testParser.usfm);
      const finalMarkers = findAllMarkers(generatedUSFM);
      assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`);
    });
  });
});
/**
 * Collect the distinct marker names represented in a USX DOM tree, in order
 * of first appearance (depth-first).
 *
 * Besides each element's `style` attribute, some features stand for markers
 * of their own: a `ref` element -> `ref`, `altnumber` -> `ca` / `va`,
 * `pubnumber` -> `cp` / `vp` (chapter variants on `chapter` elements), and
 * `category` -> `cat`. Text nodes contribute nothing; nodes that have no
 * `getAttribute` method (e.g. comments) are skipped but their children, if
 * any, are still visited.
 *
 * @param {object} element - DOM node to walk (normally the `usx` root)
 * @param {boolean} [keepNumber=true] - when false, trailing digits are
 *   removed (`q1` -> `q`) and the result is de-duplicated again
 * @returns {string[]} unique marker names
 */
function getNodes(element, keepNumber = true) {
  const seen = new Set();

  const visit = (node) => {
    if (node.nodeType === node.TEXT_NODE) {
      return;
    }
    if (typeof node.getAttribute === 'function') {
      const style = node.getAttribute('style');
      if (style) {
        seen.add(style);
      }
      if (node.tagName === "ref") {
        seen.add("ref");
      }
      if (node.getAttribute('altnumber')) {
        seen.add(node.tagName === 'chapter' ? 'ca' : 'va');
      }
      if (node.getAttribute('pubnumber')) {
        seen.add(node.tagName === 'chapter' ? 'cp' : 'vp');
      }
      if (node.getAttribute('category')) {
        seen.add('cat');
      }
    }
    Array.from(node.childNodes ?? []).forEach((child) => visit(child));
  };
  visit(element);

  if (keepNumber) {
    return [...seen];
  }
  // Stripping digits can make q1 and q2 collide, so de-duplicate afterwards.
  return [...new Set([...seen].map((item) => item.replace(/\d+$/, '')))];
}