diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..97ca688 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: ["EmilStenstrom"] + diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..fb535f8 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,179 @@ +# This file is autogenerated by maturin v1.8.1 +# To update, run +# +# maturin generate-ci github +# +name: Publish to PyPI + +on: + push: + tags: + - '*' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-22.04 + target: x86_64 + - runner: ubuntu-22.04 + target: x86 + - runner: ubuntu-22.04 + target: aarch64 + - runner: ubuntu-22.04 + target: armv7 + - runner: ubuntu-22.04 + target: s390x + - runner: ubuntu-22.04 + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-22.04 + target: x86_64 + - runner: ubuntu-22.04 + target: x86 + - runner: ubuntu-22.04 + target: aarch64 + - runner: ubuntu-22.04 + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: 
actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-13 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + permissions: + # Use to sign the release artifacts + id-token: write + # Used to upload release artifacts + contents: write + # Used to generate artifact attestation + attestations: write + steps: + - uses: actions/download-artifact@v4 + 
- name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-path: 'wheels-*/*' + - name: Publish to PyPI + if: ${{ startsWith(github.ref, 'refs/tags/') }} + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..161a84e --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,58 @@ +name: Run tests + +on: + push: + branches: + - 'main' + - 'dev' + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + os: [ubuntu-22.04, windows-latest] # ubuntu-20.04 hosted runner image is retired; 22.04 matches publish.yml + + steps: + - uses: actions/checkout@v4 + + # First check Rust tests + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + components: rustfmt, clippy + + - name: Cache Rust dependencies + uses: Swatinem/rust-cache@v2 + + - name: Run Rust tests + run: cargo test + + # After Rust tests pass, run Python tests next + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install Python dependencies + run: | + # NOTE: maturin requires a virtual environment to be active + python -m venv .venv + ${{ runner.os == 'Windows' && '.venv\Scripts\activate' || 'source .venv/bin/activate' }} + python -m pip install --upgrade pip + python -m pip install -r requirements-ci.txt + + - name: Build Python package + run: | + ${{ runner.os == 'Windows' && '.venv\Scripts\activate' || 'source .venv/bin/activate' }} + maturin develop + + - name: Run Python tests + run: | + ${{ runner.os == 'Windows' && '.venv\Scripts\activate' || 'source .venv/bin/activate' }} + pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index
0000000..6bae0a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,96 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +include/ +man/ +venv/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +*.sqlite3 + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# VSCode +.vscode + +# Poetry +# lock file is not needed for development +# as project supports variety of Django versions +poetry.lock + +# PyCharm +.idea/ + +# Python environment +.venv/ +.DS_Store +.python-version +site +.direnv/ +.envrc + +# JS, NPM Dependency directories +node_modules/ +jspm_packages/ + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e1a9e5e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Release notes + +## v1.0.0 + +Initial release. + +#### Feat + +- Parser can be configured to add attributes to the HTML elements. +- Parser optionally captures what attributes were set on HTML elements + identified by a specific attribute. 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..ac35f2e --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at emil@emilstenstrom.se. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..aa2ac46 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,296 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "djc_core_html_parser" +version = "1.0.0" +dependencies = [ + "pyo3", + "quick-xml", +] + +[[package]] +name = "indoc" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e681a6cfdc4adcc93b4d3cf993749a4552018ee0a9b65fc0ccfad74352c72a38" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "parking_lot", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "076c73d0bc438f7a4ef6fdd0c3bb4732149136abd952b110ac93e4edb13a6ba5" +dependencies = [ + "once_cell", + "target-lexicon", +] + 
+[[package]] +name = "pyo3-ffi" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e53cee42e77ebe256066ba8aa77eff722b3bb91f3419177cf4cd0f304d3284d9" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfeb4c99597e136528c6dd7d5e3de5434d1ceaf487436a3f03b2d56b6fc9efd1" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "947dc12175c254889edc0c02e399476c2f652b4b9ebd123aa655c224de259536" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "quick-xml" +version = "0.37.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "syn" +version = "1.0.109" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" + +[[package]] +name = "unindent" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..7eff0db --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "djc_core_html_parser" +version = "1.0.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "djc_core_html_parser" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.19.0", features = ["extension-module"] } +quick-xml = "0.37.2" + +# https://ohadravid.github.io/posts/2023-03-rusty-python +[profile.release] +debug = true # Debug symbols for profiler. +lto = true # Link-time optimization. +codegen-units = 1 # Slower compilation but faster code. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e28e25f --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +# djc-core-html-parser + +HTML parser used by [django-components](https://github.com/django-components/django-components). 
Written in Rust, exposed as a Python package with [maturin](https://www.maturin.rs/). + +This implementation was found to be 40-50x faster than our Python implementation, taking ~90ms to parse 5 MB of HTML. + +## Installation + +```sh +pip install djc-core-html-parser +``` + +## Usage + +```python +from djc_core_html_parser import set_html_attributes + +html = '

Hello

' +result, _ = set_html_attributes( + html, + # Add attributes to the root elements + root_attributes=['data-root-id'], + # Add attributes to all elements + all_attributes=['data-v-123'], +) +``` + +To save ourselves from re-parsing the HTML, `set_html_attributes` returns not just the transformed HTML, but also a dictionary as the second item. + +This dictionary contains a record of which HTML attributes were written to which elements. + +To populate this dictionary, you need to set `watch_on_attribute` to an attribute name. + +Then, during the HTML transformation, we check each element for this attribute. And if the element HAS this attribute, we: + +1. Get the value of said attribute +2. Record the attributes that were added to the element, using the value of the watched attribute as the key. + +```python +from djc_core_html_parser import set_html_attributes + +html = """ +
+

+ Hello +

+
+""" + +result, captured = set_html_attributes( + html, + # Add attributes to the root elements + root_attributes=['data-root-id'], + # Add attributes to all elements + all_attributes=['data-djc-tag'], + # Watch for this attribute on elements + watch_on_attribute='data-watch-id', +) + +print(captured) +# { +# '123': ['data-root-id', 'data-djc-tag'], +# '456': ['data-djc-tag'], +# } +``` + +## Development + +1. Set up the Python env + + ```sh + python -m venv .venv + ``` + +2. Install dependencies + + ```sh + pip install -r requirements-dev.txt + ``` + + The dev requirements also include `maturin`, which is used for packaging a Rust project + as a Python package. + +3. Install Rust + + See https://www.rust-lang.org/tools/install + +4. Run Rust tests + + ```sh + cargo test + ``` + +5. Build the Python package + + ```sh + maturin develop + ``` + + To build the production-optimized package, use `maturin develop --release`. + +6. Run Python tests + + ```sh + pytest + ``` + + > NOTE: When running Python tests, you need to run `maturin develop` first. + +## Deployment + +Deployment is done automatically via GitHub Actions. + +To publish a new version of the package, you need to: + +1. Bump the version in `pyproject.toml` and `Cargo.toml` +2. Open a PR and merge it to `main`. +3. Create a new tag on the `main` branch with the new version number (e.g. `v1.0.0`), or create a new release in the GitHub UI. diff --git a/__init__.pyi b/__init__.pyi new file mode 100644 index 0000000..eeeff7c --- /dev/null +++ b/__init__.pyi @@ -0,0 +1,34 @@ +from typing import List, Dict, Optional + +def set_html_attributes( + html: str, + root_attributes: List[str], + all_attributes: List[str], + check_end_names: Optional[bool] = None, + watch_on_attribute: Optional[str] = None, +) -> tuple[str, Dict[str, List[str]]]: + """ + Transform HTML by adding attributes to root and all elements. + + Args: + html (str): The HTML string to transform. Can be a fragment or full document.
+ root_attributes (List[str]): List of attribute names to add to root elements only. + all_attributes (List[str]): List of attribute names to add to all elements. + check_end_names (Optional[bool]): Whether to validate matching of end tags. Defaults to None. + watch_on_attribute (Optional[str]): If set, captures which attributes were added to elements with this attribute. + + Returns: + A tuple containing: + - The transformed HTML string + - A dictionary mapping captured attribute values to lists of attributes that were added + to those elements. Only returned if watch_on_attribute is set, otherwise empty dict. + + Example: + >>> html = '

Hello

' + >>> set_html_attributes(html, ['data-root-id'], ['data-v-123']) + '

Hello

' + + Raises: + ValueError: If the HTML is malformed or cannot be parsed. + """ + ... diff --git a/py.typed b/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bda53c0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,96 @@ +[build-system] +requires = ["maturin>=1.8,<2.0"] +build-backend = "maturin" + +[project] +name = "djc_core_html_parser" +version = "1.0.0" +requires-python = ">=3.8, <4.0" +description = "HTML parser used by django-components written in Rust." +keywords = ["django", "components", "html"] +readme = "README.md" +authors = [ + {name = "Juro Oravec", email = "juraj.oravec.josefson@gmail.com"}, +] +classifiers = [ + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [] +license = {text = "MIT"} + +# See https://docs.pypi.org/project_metadata/#icons +[project.urls] +Homepage = "https://github.com/django-components/djc-core-html-parser/" +Changelog = "https://github.com/django-components/djc-core-html-parser/blob/main/CHANGELOG.md" +Issues = "https://github.com/django-components/djc-core-html-parser/issues" +Donate = "https://github.com/sponsors/EmilStenstrom" + +[tool.maturin] +features = ["pyo3/extension-module"] + +[tool.black] +line-length = 119 +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | activate + | _build + | buck-out + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +line_length = 119 +multi_line_output = 3 +include_trailing_comma = "True" 
+known_first_party = "djc_core_html_parser" + +[tool.flake8] +ignore = ['E302', 'W503'] +max-line-length = 119 +exclude = [ + 'migrations', + '__pycache__', + 'manage.py', + 'settings.py', + 'env', + '.env', + '.venv', + '.tox', + 'build', +] + +[tool.mypy] +check_untyped_defs = true +ignore_missing_imports = true +exclude = [ + 'build', +] + +[[tool.mypy.overrides]] +module = "djc_core_html_parser.*" +disallow_untyped_defs = true + + +[tool.pytest.ini_options] +testpaths = [ + "tests" +] diff --git a/requirements-ci.in b/requirements-ci.in new file mode 100644 index 0000000..33832cb --- /dev/null +++ b/requirements-ci.in @@ -0,0 +1,2 @@ +maturin +pytest \ No newline at end of file diff --git a/requirements-ci.txt b/requirements-ci.txt new file mode 100644 index 0000000..0607ebc --- /dev/null +++ b/requirements-ci.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements-ci.in +# +iniconfig==2.0.0 + # via pytest +maturin==1.8.1 + # via -r requirements-ci.in +packaging==24.2 + # via pytest +pluggy==1.5.0 + # via pytest +pytest==8.3.4 + # via -r requirements-ci.in diff --git a/requirements-dev.in b/requirements-dev.in new file mode 100644 index 0000000..bd354f4 --- /dev/null +++ b/requirements-dev.in @@ -0,0 +1,2 @@ +maturin +pytest diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ea6c360 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements-dev.in +# +iniconfig==2.0.0 + # via pytest +maturin==1.8.1 + # via -r requirements-dev.in +packaging==24.2 + # via pytest +pluggy==1.5.0 + # via pytest +pytest==8.3.4 + # via -r requirements-dev.in diff --git a/src/html_parser.rs b/src/html_parser.rs new file mode 100644 index 0000000..5bb7eef --- /dev/null +++ b/src/html_parser.rs @@ -0,0 +1,463 @@ +use 
pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyTuple}; +use quick_xml::events::{BytesStart, Event}; +use quick_xml::reader::Reader; +use quick_xml::writer::Writer; +use std::collections::HashSet; +use std::io::Cursor; + +// List of HTML5 void elements. These can be written as `<tag>` or `<tag />`, +// e.g. `<br>`, `<img>`, `<input>`, etc.
+const VOID_ELEMENTS: [&str; 14] = [ + "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", + "track", "wbr", +]; + +/// Transform HTML by adding attributes to the elements. +/// +/// Args: +/// html (str): The HTML string to transform. Can be a fragment or full document. +/// root_attributes (List[str]): List of attribute names to add to root elements only. +/// all_attributes (List[str]): List of attribute names to add to all elements. +/// check_end_names (bool, optional): Whether to validate matching of end tags. Defaults to false. +/// watch_on_attribute (str, optional): If set, captures which attributes were added to elements with this attribute. +/// +/// Returns: +/// Tuple[str, Dict[str, List[str]]]: A tuple containing: +/// - The transformed HTML string +/// - A dictionary mapping captured attribute values to lists of attributes that were added +/// to those elements. Only returned if watch_on_attribute is set, otherwise empty dict. +/// +/// Example: +/// >>> html = '

Hello

' +/// >>> html, captured = set_html_attributes(html, ['data-root-id'], ['data-v-123'], watch_on_attribute='data-id') +/// >>> print(captured) +/// {'123': ['data-root-id', 'data-v-123']} +/// +/// Raises: +/// ValueError: If the HTML is malformed or cannot be parsed. +#[pyfunction] +#[pyo3( + text_signature = "(html, root_attributes, all_attributes, *, check_end_names=False, watch_on_attribute=None)" +)] +pub fn set_html_attributes( + py: Python, + html: &str, + root_attributes: Vec, + all_attributes: Vec, + check_end_names: Option, + watch_on_attribute: Option, +) -> PyResult { + let config = HtmlTransformerConfig::new( + root_attributes, + all_attributes, + check_end_names.unwrap_or(false), + watch_on_attribute, + ); + + match transform(&config, html) { + Ok((html, captured)) => { + // Convert captured attributes to a Python dictionary + let captured_dict = PyDict::new(py); + for (id, attrs) in captured { + captured_dict.set_item(id, attrs)?; + } + + let result = PyTuple::new(py, &[html.into_py(py), captured_dict.into_py(py)]); + Ok(result.into()) + } + Err(e) => Err(PyValueError::new_err(e.to_string())), + } +} + +/// Configuration for HTML transformation +pub struct HtmlTransformerConfig { + root_attributes: Vec, + all_attributes: Vec, + void_elements: HashSet, + check_end_names: bool, + watch_on_attribute: Option, +} + +impl HtmlTransformerConfig { + pub fn new( + root_attributes: Vec, + all_attributes: Vec, + check_end_names: bool, + watch_on_attribute: Option, + ) -> Self { + let void_elements = VOID_ELEMENTS.iter().map(|&s| s.to_string()).collect(); + + HtmlTransformerConfig { + root_attributes, + all_attributes, + void_elements, + check_end_names, + watch_on_attribute, + } + } +} + +/// Add attributes to a HTML start tag (e.g. `
`) based on the configuration +fn add_attributes( + config: &HtmlTransformerConfig, + element: &mut BytesStart, + is_root: bool, + captured_attributes: &mut Vec<(String, Vec)>, +) { + let mut added_attrs = Vec::new(); + + // Add root attributes if this is a root element + if is_root { + for attr in &config.root_attributes { + element.push_attribute((attr.as_str(), "")); + added_attrs.push(attr.clone()); + } + } + + // Add attributes that should be applied to all elements + for attr in &config.all_attributes { + element.push_attribute((attr.as_str(), "")); + added_attrs.push(attr.clone()); + } + + // If we're watching for a specific attribute, check if this element has it + if let Some(watch_attr) = &config.watch_on_attribute { + if let Some(attr_value) = element + .attributes() + .find(|a| { + if let Ok(attr) = a { + String::from_utf8_lossy(attr.key.as_ref()) == *watch_attr + } else { + false + } + }) + .and_then(|a| a.ok()) + .map(|a| String::from_utf8_lossy(a.value.as_ref()).into_owned()) + { + captured_attributes.push((attr_value, added_attrs)); + } + } +} + +/// Main entrypoint. Transform HTML by adding attributes to the elements. 
+pub fn transform( + config: &HtmlTransformerConfig, + html: &str, +) -> Result<(String, Vec<(String, Vec)>), Box> { + let mut reader = Reader::from_str(html); + let reader_config = reader.config_mut(); + reader_config.check_end_names = config.check_end_names; + + // We transform the HTML by reading it and writing it simultaneously + let mut writer = Writer::new(Cursor::new(Vec::new())); + let mut captured_attributes = Vec::new(); + + // Track the nesting depth of elements to identify root elements (depth == 0) + let mut depth: i32 = 0; + + // Read the HTML event by event + loop { + match reader.read_event() { + // Start tag + Ok(Event::Start(e)) => { + let tag_name = String::from_utf8_lossy(e.name().as_ref()) + .to_string() + .to_lowercase(); + let mut elem = e.into_owned(); + add_attributes(config, &mut elem, depth == 0, &mut captured_attributes); + + // For void elements, write as Empty event + if config.void_elements.contains(&tag_name) { + writer.write_event(Event::Empty(elem))?; + } else { + writer.write_event(Event::Start(elem))?; + depth += 1; + } + } + + // End tag + Ok(Event::End(e)) => { + let tag_name = String::from_utf8_lossy(e.name().as_ref()) + .to_string() + .to_lowercase(); + + // Skip end tags for void elements + if !config.void_elements.contains(&tag_name) { + writer.write_event(Event::End(e))?; + depth -= 1; + } + } + + // Empty element (AKA void or self-closing tag, e.g. `
`) + Ok(Event::Empty(e)) => { + let mut elem = e.into_owned(); + add_attributes(config, &mut elem, depth == 0, &mut captured_attributes); + writer.write_event(Event::Empty(elem))?; + } + + // End of file + Ok(Event::Eof) => break, + // Other events (e.g. comments, processing instructions, etc.) + Ok(e) => writer.write_event(e)?, + Err(e) => return Err(Box::new(e)), + } + } + + // Convert the transformed HTML to a string + let result = String::from_utf8(writer.into_inner().into_inner())?; + Ok((result, captured_attributes)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_transformation() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-all".to_string()], + false, + None, + ); + + let input = "

Hello

"; + let (result, _) = transform(&config, input).unwrap(); + + assert!(result.contains("data-root")); + assert!(result.contains("data-all")); + } + + #[test] + fn test_multiple_roots() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-all".to_string()], + false, + None, + ); + + let input = "
First
Second"; + let (result, _) = transform(&config, input).unwrap(); + + // Both root elements should have data-root + assert_eq!(result.matches("data-root").count(), 2); + // All elements should have data-all + assert_eq!(result.matches("data-all").count(), 2); + } + + #[test] + fn test_complex_html() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-all".to_string(), "data-v-123".to_string()], + false, + None, + ); + + let input = r#" +
+
+

Hello & Welcome

+ +
+
+
+

Article 1

+

Some text with bold and emphasis

+ Test Image +
+
+
+
+

© 2024

+
+ "#; + + let (result, _) = transform(&config, input).unwrap(); + + // Check root elements have root attributes + assert!(result.contains( + r#"
"# + )); + assert!(result.contains(r#"
"#)); + + // Check nested elements have all_attributes but not root_attributes + assert!(result.contains(r#"

"#)); + assert!(result.contains(r#"

")); + } + + #[test] + fn test_html_head_with_meta() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + false, + None, + ); + + let input = r#" + + + Test Page + + + "#; + + let (result, _) = transform(&config, input).unwrap(); + + // Check that it parsed successfully + assert!(result.contains(r#"Test Page"#)); + assert!(result.contains(r#"")); + assert!(!result.contains("")); + assert!(result.contains("/>")); + } + + #[test] + fn test_config_check_end_names() { + // Test with check_end_names = false (lenient mode) + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + false, // Don't check end names + None, + ); + + // These should parse successfully with check_end_names = false + let lenient_cases = [ + "

Hello

", // Mismatched nesting + "
Text", // Wrong closing tag + "

Text", // Non-matching end tag + ]; + + for input in lenient_cases { + assert!(transform(&config, input).is_ok()); + } + + // Test with check_end_names = true (strict mode) + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + true, // Check end names + None, + ); + + // These should fail with check_end_names = true + for input in lenient_cases { + assert!(transform(&config, input).is_err()); + } + + // But well-formed HTML should still work + let valid_input = "

Hello

"; + assert!(transform(&config, valid_input).is_ok()); + } + + #[test] + fn test_watch_attribute() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + false, + Some("data-id".to_string()), + ); + + let input = r#" +
+

Regular element

+ Nested element + +
"#; + + let (result, captured) = transform(&config, input).unwrap(); + + println!("result: {}", result); + println!("captured: {:?}", captured); + + // Verify HTML transformation worked + assert!(result.contains(r#"
"#)); + assert!(result.contains(r#""#)); + assert!(result.contains(r#""#)); + + // Verify attribute capturing + assert_eq!(captured.len(), 3); + assert!(captured.iter().any(|(id, attrs)| id == "123" + && attrs.contains(&"data-root".to_string()) + && attrs.contains(&"data-v-123".to_string()))); + assert!(captured + .iter() + .any(|(id, attrs)| id == "456" && attrs.contains(&"data-v-123".to_string()))); + assert!(captured + .iter() + .any(|(id, attrs)| id == "789" && attrs.contains(&"data-v-123".to_string()))); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..050fa3e --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,11 @@ +use pyo3::prelude::*; +use pyo3::types::PyModule; + +mod html_parser; + +/// A Python module implemented in Rust for high-performance HTML transformation. +#[pymodule] +fn djc_core_html_parser(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_function(wrap_pyfunction!(html_parser::set_html_attributes, m)?)?; + Ok(()) +} diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..1cf02b1 --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,97 @@ +from statistics import mean, stdev +import time + +from djc_core_html_parser import set_html_attributes + + +def generate_large_html(num_elements: int = 1000) -> str: + """Generate a large HTML document with various features for benchmarking.""" + elements = [] + for i in range(num_elements): + # Mix of different elements and features + if i % 5 == 0: + # Void element with multiple attributes + elements.append(f'Image {i}') + elif i % 5 == 1: + # Nested divs with attributes + elements.append( + f""" +
+
+

Content {i}

+ +
+
+ """ + ) + elif i % 5 == 2: + # Script tag with content + elements.append( + f""" + + """ + ) + elif i % 5 == 3: + # CDATA section + elements.append( + f""" + + ]]> + """ + ) + else: + # Regular element with attributes + elements.append( + f""" +
+

Heading {i}

+

Paragraph {i}

+
+ """ + ) + + return f""" + + + + Benchmark Page + + + + {''.join(elements)} + + + """ + + +# Generate test HTML +HTML_SIZE = 27_000 # Set to 11_000 for 2MB +NUM_ITER = 2 +html = generate_large_html(HTML_SIZE) +print(f"\nBenchmarking with HTML size: {len(html) // 1_000} KB") + +root_attributes = ["data-root-id"] +all_attributes = ["data-v-123"] + +# Test transform +modify_times = [] +for i in range(NUM_ITER): # Run N iterations + + start = time.perf_counter() + set_html_attributes(html, root_attributes, all_attributes, watch_on_attribute="data-id") + modify_time = time.perf_counter() - start + modify_times.append(modify_time) + +print("\nTransform:") +print(f" Total: {sum(modify_times):.3f}s") +print(f" Min: {min(modify_times):.3f}s") +print(f" Max: {max(modify_times):.3f}s") +print(f" Avg: {mean(modify_times):.3f}s") +print(f" Std: {stdev(modify_times):.3f}s") diff --git a/tests/test_html_parser.py b/tests/test_html_parser.py new file mode 100644 index 0000000..57cf3b1 --- /dev/null +++ b/tests/test_html_parser.py @@ -0,0 +1,148 @@ +# This same set of tests is also found in django-components, to ensure that +# this implementation can be replaced with the django-components' pure-python implementation + +from djc_core_html_parser import set_html_attributes +from typing import Dict, List + + +def test_basic_transformation(): + html = "

Hello

" + result, _ = set_html_attributes(html, ["data-root"], ["data-all"]) + expected = '

Hello

' + assert result == expected + + +def test_multiple_roots(): + html = "
First
Second" + result, _ = set_html_attributes(html, ["data-root"], ["data-all"]) + expected = '
First
Second' + assert result == expected + + +def test_complex_html(): + html = """ +
+
+

Hello & Welcome

+ +
+
+
+

Article 1

+

Some text with bold and emphasis

+ Test Image +
+
+
+
+

© 2024

+
+ """ + + result, _ = set_html_attributes(html, ["data-root"], ["data-all", "data-v-123"]) + expected = """ +
+
+

Hello & Welcome

+ +
+
+
+

Article 1

+

Some text with bold and emphasis

+ Test Image +
+
+
+
+

© 2024

+
+ """ # noqa: E501 + assert result == expected + + +def test_void_elements(): + test_cases = [ + ('', ''), + ('', ''), + ("


", '


'), + ('Test', 'Test'), + ] + + for input_html, expected in test_cases: + result, _ = set_html_attributes(input_html, ["data-root"], ["data-v-123"]) + assert result == expected + + +def test_html_head_with_meta(): + html = """ + + + Test Page + + + """ + + result, _ = set_html_attributes(html, ["data-root"], ["data-v-123"]) + expected = """ + + + Test Page + + + """ + assert result == expected + + +def test_watch_attribute(): + html = """ +
+

Regular element

+ Nested element + +
""" + + result: str + captured: Dict[str, List[str]] + result, captured = set_html_attributes(html, ["data-root"], ["data-v-123"], watch_on_attribute="data-id") + expected = """ +
+

Regular element

+ Nested element + +
""" + assert result == expected + + # Verify attribute capturing + assert len(captured) == 3 + + # Root element should have both root and all attributes + assert "123" in captured + assert "data-root" in captured["123"] + assert "data-v-123" in captured["123"] + + # Non-root elements should only have all attributes + assert "456" in captured + assert captured["456"] == ["data-v-123"] + assert "789" in captured + assert captured["789"] == ["data-v-123"] + + +def test_whitespace_preservation(): + html = """
+

Hello World

+ Text with spaces +
""" + + result, _ = set_html_attributes(html, ["data-root"], ["data-all"]) + expected = """
+

Hello World

+ Text with spaces +
""" + assert result == expected