diff --git a/.github/workflows/_quality-python.yml b/.github/workflows/_quality-python.yml index c67052a799..a015f53031 100644 --- a/.github/workflows/_quality-python.yml +++ b/.github/workflows/_quality-python.yml @@ -36,7 +36,7 @@ jobs: ${{ inputs.working-directory }}/poetry.lock - name: Install packages for workers that use datasets if: ${{ inputs.is-datasets-worker }} - run: sudo apt update; sudo apt install -y libicu-dev ffmpeg libavcodec-extra libsndfile1 llvm pkg-config + run: sudo apt update; sudo apt install -y libicu-dev ffmpeg libavcodec-extra libsndfile1 llvm pkg-config poppler-utils - name: Install dependencies # "poetry env use" is required: https://github.com/actions/setup-python/issues/374#issuecomment-1088938718 run: | diff --git a/.github/workflows/_unit-tests-python.yml b/.github/workflows/_unit-tests-python.yml index d7155b0f32..6109f07991 100644 --- a/.github/workflows/_unit-tests-python.yml +++ b/.github/workflows/_unit-tests-python.yml @@ -37,7 +37,7 @@ jobs: ${{ inputs.working-directory }}/poetry.lock - name: Install packages for workers that use datasets if: ${{ inputs.is-datasets-worker }} - run: sudo apt update; sudo apt install -y libicu-dev ffmpeg libavcodec-extra libsndfile1 llvm pkg-config + run: sudo apt update; sudo apt install -y libicu-dev ffmpeg libavcodec-extra libsndfile1 llvm pkg-config poppler-utils - name: Install dependencies # "poetry env use" is required: https://github.com/actions/setup-python/issues/374#issuecomment-1088938718 run: | diff --git a/workers/datasets_based/Dockerfile b/workers/datasets_based/Dockerfile index 83c7705345..39ee8c95b7 100644 --- a/workers/datasets_based/Dockerfile +++ b/workers/datasets_based/Dockerfile @@ -17,6 +17,7 @@ ENV PYTHONFAULTHANDLER=1 \ RUN apt-get update \ && apt-get install -y build-essential unzip wget python3-dev make \ libicu-dev ffmpeg libavcodec-extra libsndfile1 llvm pkg-config \ + poppler-utils \ && rm -rf /var/lib/apt/lists/* RUN pip install -U --no-cache-dir pip diff --git a/workers/datasets_based/poetry.lock b/workers/datasets_based/poetry.lock index fff9c50ef8..b756355b95 100644 --- a/workers/datasets_based/poetry.lock +++ b/workers/datasets_based/poetry.lock @@ -846,6 +846,7 @@ python-versions = "*" [package.source] type = "url" url = "https://github.com/kpu/kenlm/archive/master.zip" + [[package]] name = "keras" version = "2.11.0" @@ -1272,6 +1273,17 @@ category = "dev" optional = false python-versions = ">=2.6" +[[package]] +name = "pdf2image" +version = "1.16.2" +description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list." +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +pillow = "*" + [[package]] name = "pillow" version = "9.4.0" @@ -1600,6 +1612,24 @@ python-versions = ">=3.6.8" [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf2" +version = "3.0.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +typing_extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +crypto = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow", "PyCryptodome"] +image = ["Pillow"] + [[package]] name = "pyppmd" version = "1.0.0" @@ -2150,6 +2180,7 @@ opt-einsum = ["opt-einsum (>=3.3)"] [package.source] type = "url" url = "https://download.pytorch.org/whl/cpu/torch-1.13.1%2Bcpu-cp39-cp39-linux_x86_64.whl" + [[package]] name = "torchaudio" version = "0.13.1+cpu" @@ -2164,6 +2195,7 @@ torch = "1.13.1" [package.source] type = "url" url = "https://download.pytorch.org/whl/cpu/torchaudio-0.13.1%2Bcpu-cp39-cp39-linux_x86_64.whl" + [[package]] name = "tqdm" version = "4.64.1" @@ -2447,7 +2479,7 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "1.1" python-versions = "3.9.15" -content-hash = "0fa47399ab7f3f7a1bd3676b6b171e166336318c18270d468b112c5701595f43" +content-hash = "7be4e8e664880da11c0b018fc1902c413ecafbf410f5513c6bae66e85e631164" [metadata.files] absl-py = [ @@ -3882,6 +3914,10 @@ pbr = [ {file = "pbr-5.11.1-py2.py3-none-any.whl", hash = "sha256:567f09558bae2b3ab53cb3c1e2e33e726ff3338e7bae3db5dc954b3a44eef12b"}, {file = "pbr-5.11.1.tar.gz", hash = "sha256:aefc51675b0b533d56bb5fd1c8c6c0522fe31896679882e1c4c63d5e4a0fccb3"}, ] +pdf2image = [ + {file = "pdf2image-1.16.2-py3-none-any.whl", hash = "sha256:1469335050a17657f94c2f1ef3a23e57807d631ad5bcbaec997c2c42a8186f4a"}, + {file = "pdf2image-1.16.2.tar.gz", hash = "sha256:86761091eee35f4641ea98dfddb254254361d018be698a199aff7c1d37331803"}, +] pillow = [ {file = "Pillow-9.4.0-1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b4b4e9dda4f4e4c4e6896f93e84a8f0bcca3b059de9ddf67dac3c334b1195e1"}, {file = "Pillow-9.4.0-1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:fb5c1ad6bad98c57482236a21bf985ab0ef42bd51f7ad4e4538e89a997624e12"}, @@ -4327,6 +4363,10 @@ pyparsing = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, ] +pypdf2 = [ + {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, + {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, +] pyppmd = [ {file = "pyppmd-1.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8049c19af4b78b400b2347bff4514763257b55516c359144e9d8091991ed12e8"}, {file = "pyppmd-1.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1c0fd06aaf782e65b7b5bbc47f8a9dbe050c1ba18474ccbe0a2b37f57a8d8c72"}, diff --git a/workers/datasets_based/pyproject.toml b/workers/datasets_based/pyproject.toml index b3b5672a2e..1619056c71 100644 --- a/workers/datasets_based/pyproject.toml +++ b/workers/datasets_based/pyproject.toml @@ -23,8 +23,10 @@ lxml = "^4.9.1" nlp = "^0.4.0" nltk = "^3.6.5" openpyxl = "^3.0.9" +pdf2image = "^1.16.2" py7zr = "^0.20.1" pydub = "^0.25.1" +pypdf2 = "^3.0.1" python = "3.9.15" rarfile = "^4.0" scikit-learn = "^1.0"