diff --git a/services/worker/poetry.lock b/services/worker/poetry.lock index 0934c65b7d..d769fb04dd 100644 --- a/services/worker/poetry.lock +++ b/services/worker/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "absl-py" @@ -136,71 +136,6 @@ files = [ [package.dependencies] frozenlist = ">=1.1.0" -[[package]] -name = "apache-beam" -version = "2.41.0" -description = "Apache Beam SDK for Python" -category = "main" -optional = false -python-versions = ">=3.7" -files = [ - {file = "apache-beam-2.41.0.zip", hash = "sha256:c1a0456a5b48c3481bf20dc904e4d812515144336873b322f17ba188e2fabd92"}, - {file = "apache_beam-2.41.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8fe21bf02554ad6ca1c1b19d37afc0c08b9d0676fb4b5a9b1f4a17303edd94d4"}, - {file = "apache_beam-2.41.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:2b3a06b0cb73a2e1b5ad892c3fab36bd8454ac8abee3cae23832c03ab1cc7121"}, - {file = "apache_beam-2.41.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:82faeb079d9612918b0ad7cbd12fa54ca56b6cb8175c043804446eb2744d965b"}, - {file = "apache_beam-2.41.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:ea0f500e16ffb0e5932c802abd301dc042e88ef27ded0935b9b8cac58113c43c"}, - {file = "apache_beam-2.41.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:e4c101f8c1427ced3c17525540c81d05137415dc4398d5d51df4c70608aee46d"}, - {file = "apache_beam-2.41.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:615612971a6aeb15eb41697945a25bb726f6f7410c71ce029fe3196cdf486edb"}, - {file = "apache_beam-2.41.0-cp37-cp37m-win32.whl", hash = "sha256:2f9395f2faefaa28306081e8fc7ebc5e7157f6734001bfcf9233c077e3d8b2ca"}, - {file = "apache_beam-2.41.0-cp37-cp37m-win_amd64.whl", hash = "sha256:bb216e5890279988c490428cfbce62a03fcdd90d6feca13ec29a074c8c3cfe36"}, - {file = "apache_beam-2.41.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cbad1f7d207224efbe8e461ceacf8c21e21e4a5a011cde87e5881649e24b5e4e"}, - {file = "apache_beam-2.41.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:dab8ee4b15cc2608bf5a715167150210246924cca65ddc6847afa21f5211c22f"}, - {file = "apache_beam-2.41.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cca61ef1cc417ce2eca8f331fa0f8f9bbceaf6e67460a048527ebd1c33562d16"}, - {file = "apache_beam-2.41.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:c25ab457f4ebe356fe3726c5e7554ca29e975bf5df67ff20e339fc5e568ed550"}, - {file = "apache_beam-2.41.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:b7c8ca34772c26cd3103b36b69f2c31fa834ac5bb85859ac9dd51b64a2100b5b"}, - {file = "apache_beam-2.41.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:9abcb01b85fd27eaac29ea90757da1d95e293ca2fdaf6b69192020ee05d71257"}, - {file = "apache_beam-2.41.0-cp38-cp38-win32.whl", hash = "sha256:7b6581739ea8d5a346b4a722d1d280adf748d74a5c31322288a0fa9ba3204645"}, - {file = "apache_beam-2.41.0-cp38-cp38-win_amd64.whl", hash = "sha256:7c2ab828a7a3b8973f5f01101fd7746a8562a20f5f390b07e301744afa2a83d7"}, - {file = "apache_beam-2.41.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3e28758eec094b7f5585e92d8a6f9b5745a6b335d646b8fd58b6dd7f99109e67"}, - {file = "apache_beam-2.41.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:ea94024188e8aa1eb9774be66cf368d44c08cf3b34626fca4803bb33c353b72d"}, - {file = "apache_beam-2.41.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:06da861c4092f64ed9868375e8049ae26b3208c105f3f93268eacd3c7a35e1b9"}, - {file = "apache_beam-2.41.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:e511f2cf7d767810ad51ddca72ab93992d0bbd310984f36d5a5659276f3e5e98"}, - {file = "apache_beam-2.41.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:4777dbbb0ed371cf7c72d784acdefba5963d61bac11a3b62875b5817fad2d608"}, - {file = "apache_beam-2.41.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf3e5d838122d8fdf8fc1a1752ff2b661ad5a0641bb62dc227e433b557022acb"}, - {file = "apache_beam-2.41.0-cp39-cp39-win32.whl", hash = "sha256:fa9d2f4ce10662950fa9bec37295d8c1a50dbd4b558ea61693c6bd455d7db790"}, - {file = "apache_beam-2.41.0-cp39-cp39-win_amd64.whl", hash = "sha256:f978f7a815e2ee00c9bcec756b1aa7114ab4ba4c572978a48610f1bed6bc5e35"}, -] - -[package.dependencies] -cloudpickle = ">=2.1.0,<3" -crcmod = ">=1.7,<2.0" -dill = ">=0.3.1.1,<0.3.2" -fastavro = ">=0.23.6,<2" -grpcio = ">=1.33.1,<2" -hdfs = ">=2.1.0,<3.0.0" -httplib2 = ">=0.8,<0.21.0" -numpy = ">=1.14.3,<1.23.0" -orjson = "<4.0" -proto-plus = ">=1.7.1,<2" -protobuf = ">=3.12.2,<4" -pyarrow = ">=0.15.1,<8.0.0" -pydot = ">=1.2.0,<2" -pymongo = ">=3.8.0,<4.0.0" -python-dateutil = ">=2.8.0,<3" -pytz = ">=2018.3" -requests = ">=2.24.0,<3.0.0" -typing-extensions = ">=3.7.0" - -[package.extras] -aws = ["boto3 (>=1.9)"] -azure = ["azure-core (>=1.7.0)", "azure-storage-blob (>=12.3.2)"] -dataframe = ["pandas (>=1.0,<1.5)"] -docs = ["Sphinx (>=1.5.2,<2.0)", "docutils (==0.17.1)"] -gcp = ["cachetools (>=3.1.0,<5)", "google-api-core (!=2.8.2,<3)", "google-apitools (>=0.5.31,<0.5.32)", "google-auth (>=1.18.0,<3)", "google-auth-httplib2 (>=0.1.0,<0.2.0)", "google-cloud-bigquery (>=1.6.0,<3)", "google-cloud-bigquery-storage (>=2.6.3,<2.14)", "google-cloud-bigtable (>=0.31.1,<2)", "google-cloud-core (>=0.28.1,<3)", "google-cloud-datastore (>=1.8.0,<2)", "google-cloud-dlp (>=3.0.0,<4)", "google-cloud-language (>=1.3.0,<2)", "google-cloud-pubsub (>=2.1.0,<3)", "google-cloud-pubsublite (>=1.2.0,<2)", "google-cloud-recommendations-ai (>=0.1.0,<0.8.0)", "google-cloud-spanner (>=1.13.0,<2)", "google-cloud-videointelligence (>=1.8.0,<2)", "google-cloud-vision (>=0.38.0,<2)", "grpcio-gcp (>=0.2.2,<1)"] -interactive = ["facets-overview (>=1.0.0,<2)", "google-cloud-dataproc (>=3.0.0,<3.2.0)", "ipykernel (>=6,<7)", "ipython (>=7,<8)", "ipython (>=8,<9)", "ipywidgets (>=7.6.5,<8)", "jupyter-client (>=6.1.11,<6.1.13)", "timeloop (>=1.0.2,<2)"] -interactive-test = ["chromedriver-binary (>=100,<101)", "nbconvert (>=6.2.0,<7)", "nbformat (>=5.0.5,<6)", "needle (>=0.5.0,<1)", "pillow (>=7.1.1,<8)"] -test = ["cryptography (>=36.0.0)", "freezegun (>=0.3.12)", "joblib (>=1.0.1)", "mock (>=1.0.1,<3.0.0)", "pandas (<2.0.0)", "parameterized (>=0.7.1,<0.9.0)", "psycopg2-binary (>=2.8.5,<3.0.0)", "pyhamcrest (>=1.9,!=1.10.0,<2.0.0)", "pytest (>=4.4.0,<5.0)", "pytest-timeout (>=1.3.3,<2)", "pytest-xdist (>=1.29.0,<2)", "pyyaml (>=3.12,<7.0.0)", "requests-mock (>=1.7,<2.0)", "scikit-learn (>=0.20.0)", "sqlalchemy (>=1.3,<2.0)", "tenacity (>=5.0.2,<6.0)", "testcontainers[mysql] (>=3.0.3,<4.0.0)"] - [[package]] name = "appdirs" version = "1.4.4" @@ -732,18 +667,6 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} -[[package]] -name = "cloudpickle" -version = "2.2.1" -description = "Extended pickling support for Python objects" -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "cloudpickle-2.2.1-py3-none-any.whl", hash = "sha256:61f594d1f4c295fa5cd9014ceb3a1fc4a70b0de1164b94fbc2d854ccba056f9f"}, - {file = "cloudpickle-2.2.1.tar.gz", hash = "sha256:d89684b8de9e34a2a43b3460fbca07d09d6e25ce858df4d5a44240403b6178f5"}, -] - [[package]] name = "colorama" version = "0.4.6" @@ -920,17 +843,6 @@ files = [ {file = "crc32c-2.3.post0.tar.gz", hash = "sha256:7d4b39ca6791830c4f1c053d2d8983627af702f0445535ff53d3220f35cf6ce6"}, ] -[[package]] -name = "crcmod" -version = "1.7" -description = "CRC Generator" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "crcmod-1.7.tar.gz", hash = "sha256:dc7051a0db5f2bd48665a990d3ec1cc305a466a77358ca4492826f41f283601e"}, -] - [[package]] name = "cyclonedx-python-lib" version = "3.1.5" @@ -1037,17 +949,6 @@ files = [ dnssec = ["ecdsa (>=0.13)", "pycryptodome"] idna = ["idna (>=2.1)"] -[[package]] -name = "docopt" -version = "0.6.2" -description = "Pythonic argument parser, that will make you smile" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "docopt-0.6.2.tar.gz", hash = "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"}, -] - [[package]] name = "environs" version = "9.5.0" @@ -1097,53 +998,6 @@ files = [ [package.extras] test = ["pytest (>=6)"] -[[package]] -name = "fastavro" -version = "1.7.3" -description = "Fast read/write of AVRO files" -category = "main" -optional = false -python-versions = ">=3.7" -files = [ - {file = "fastavro-1.7.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:10a5ac9d8c66d4ba24f25ad7313e2dab56d98ceebcf53ba9cfa88acdd135c794"}, - {file = "fastavro-1.7.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2e6d8bb79e53dc39e620c777f14b5f7122f1bf21309a9fcf60085f8e062e49c"}, - {file = "fastavro-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a0ba2f43844eb784f8abf5324a0c10474287beaecb14fb736e47136464e3044"}, - {file = "fastavro-1.7.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e20db96b092d7b6208f3063a424d35bb48c283e2d8b4e7ad4ee6541dc1fac2ed"}, - {file = "fastavro-1.7.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:52ba6bb5525561df577ebd94819784626caac9d8ad2ed167030403ba1bf73159"}, - {file = "fastavro-1.7.3-cp310-cp310-win_amd64.whl", hash = "sha256:22d6f3e73f471e2b4ba0785cb60df939792e8904db4ba93037ba6b7858f7d6f9"}, - {file = "fastavro-1.7.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5dd5299cbc5bc2aa15f1c619f4cc55c054c6fe9ccd614f93eb1d6ab22cf314dd"}, - {file = "fastavro-1.7.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4706a77038bf31ad2e8cc752a0c007894bd39ffb0b775c7824113743182c5f6"}, - {file = "fastavro-1.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6b0e58e7dd34906d21738c3461cddef760de3b7845779169a378b2757afa693"}, - {file = "fastavro-1.7.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:04740e2dd27084b4155337d082f2a232cf1d801a1b009f772e50c8306a8f8aaf"}, - {file = "fastavro-1.7.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f95c767bf97e896640f24d58931b3a19df3d84ccaf0606c92e603c79de60f16"}, - {file = "fastavro-1.7.3-cp311-cp311-win_amd64.whl", hash = "sha256:a727e07007230267e25702d5f3738854cb315747fc58b84839699db30dedf490"}, - {file = "fastavro-1.7.3-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:da71b9db7718f4682cc11e0f25b5e395d5f3bc17ddaf0224f39be3bac5309cfa"}, - {file = "fastavro-1.7.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a62c359f4c9472c3ebe2be478e203ff434cc1d6bebaf61181a4a121c0899a6"}, - {file = "fastavro-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ba001864607b46fc2f6124d690731b19db215a84751c4b3b155e70b615d05"}, - {file = "fastavro-1.7.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:cd10bffc26457402da9727663de71c40dd717d90e8ab3d3b893bc227cad5e410"}, - {file = "fastavro-1.7.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e5d0f16b85104aa0e2899a47c186be1082a10cecf6b331571afa92a4b8e6061a"}, - {file = "fastavro-1.7.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b86d1c1188ec47aeb76d6195e36ab52665984e8e98f69a224ab550c82991fe07"}, - {file = "fastavro-1.7.3-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:a007151cc2a08e61dd5ea5b48989849d056a8d63b04d7e6799c36fdf0b702bf4"}, - {file = "fastavro-1.7.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b176d5731f336c2c9c88d95225f71f862b2512c33ef917b1fe7f87379cc92fd"}, - {file = "fastavro-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c52e4a1b05306f82916eacf83c732a4637a5be748bc2ef2ff6fed1506535d692"}, - {file = "fastavro-1.7.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:207bc7663133ca766eaf9033806da4cf08071dacf2e9779aa9427df40815f846"}, - {file = "fastavro-1.7.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cc98be4ad3d8fb9000abeeae0ecb0f8e62ec7898b791da5ec2f6de81dd2a73e2"}, - {file = "fastavro-1.7.3-cp38-cp38-win_amd64.whl", hash = "sha256:fb3879aaeb3b56ee5b3a22ffa11cbdf4ba65c04be4688ee8bd152aa6535a00ee"}, - {file = "fastavro-1.7.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:8c45f7fdfab351431d106f5981fdc2313a8cbfdb82d2b1172b2a144bfba376b7"}, - {file = "fastavro-1.7.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:127e928604753d845fa0f2ae758c1640215ff901a5ce20cdf7e9f154500c3212"}, - {file = "fastavro-1.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b82a71a6c014ec5d03293d8dc8b698220380266d5503779fd3712a94e4497069"}, - {file = "fastavro-1.7.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:08bfd749cce456f925203895d6732f6b68c973d63ff886733f27db3c2d3c0b9a"}, - {file = "fastavro-1.7.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:dc1e88d5db17e7ebc3fc764a1091f6c05a42e3cb0e2c8eaf49126743c7ca1bb5"}, - {file = "fastavro-1.7.3-cp39-cp39-win_amd64.whl", hash = "sha256:7d525f3f99cc49a5e245e08d7ab947195a18cbdd5c43af75c0989fbe14a32597"}, - {file = "fastavro-1.7.3.tar.gz", hash = "sha256:8b08bd3cba45830b64adda32ccc5b027a71b6941a99cc39f90d7019a7986cc19"}, -] - -[package.extras] -codecs = ["lz4", "python-snappy", "zstandard"] -lz4 = ["lz4"] -snappy = ["python-snappy"] -zstandard = ["zstandard"] - [[package]] name = "filelock" version = "3.9.0" @@ -1529,26 +1383,31 @@ files = [ numpy = ">=1.14.5" [[package]] -name = "hdfs" -version = "2.7.0" -description = "HdfsCLI: API and command line interface for HDFS." +name = "hffs" +version = "0.0.1.dev0" +description = "Filesystem interface over huggingface.co repositories" category = "main" optional = false -python-versions = "*" -files = [ - {file = "hdfs-2.7.0-py3-none-any.whl", hash = "sha256:3428078ad1e83a2e2a11801c536ac2aa5094f5fabde5d1e7145bacbf4a599c1e"}, - {file = "hdfs-2.7.0.tar.gz", hash = "sha256:ecd4650c39bb4f9421641320f4931edd81cf7126ae4e5ec880215adf6435df3d"}, -] +python-versions = ">=3.7.0" +files = [] +develop = false [package.dependencies] -docopt = "*" -requests = ">=2.7.0" -six = ">=1.9.0" +fsspec = "*" +huggingface_hub = ">=0.12.0" +packaging = ">=20.9" +requests = "*" [package.extras] -avro = ["fastavro (>=0.21.19)"] -dataframe = ["fastavro (>=0.21.19)", "pandas (>=0.14.1)"] -kerberos = ["requests-kerberos (>=0.7.0)"] +dev = ["black (>=23.1,<24.0)", "pytest", "ruff (>=0.0.241)"] +quality = ["black (>=23.1,<24.0)", "ruff (>=0.0.241)"] +tests = ["pytest"] + +[package.source] +type = "git" +url = "https://github.com/huggingface/hffs.git" +reference = "0e187e74d38e9436353691f4a7a26b15f0663f58" +resolved_reference = "0e187e74d38e9436353691f4a7a26b15f0663f58" [[package]] name = "html5lib" @@ -1572,21 +1431,6 @@ chardet = ["chardet (>=2.2)"] genshi = ["genshi"] lxml = ["lxml"] -[[package]] -name = "httplib2" -version = "0.20.4" -description = "A comprehensive HTTP client library." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "httplib2-0.20.4-py3-none-any.whl", hash = "sha256:8b6a905cb1c79eefd03f8669fd993c36dc341f7c558f056cb5a33b5c2f458543"}, - {file = "httplib2-0.20.4.tar.gz", hash = "sha256:58a98e45b4b1a48273073f905d2961666ecf0fbac4250ea5b47aef259eb5c585"}, -] - -[package.dependencies] -pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} - [[package]] name = "huggingface-hub" version = "0.12.1" @@ -2879,6 +2723,7 @@ optional = false python-versions = "*" files = [ {file = "pdf2image-1.16.3-py3-none-any.whl", hash = "sha256:b6154164af3677211c22cbb38b2bd778b43aca02758e962fe1e231f6d3b0e380"}, + {file = "pdf2image-1.16.3.tar.gz", hash = "sha256:74208810c2cef4d9e347769b8e62a52303982ddb4f2dfd744c7ab4b940ae287e"}, ] [package.dependencies] @@ -3105,24 +2950,6 @@ progress = ["tqdm (>=4.41.0,<5.0.0)"] sftp = ["paramiko (>=2.7.0)"] xxhash = ["xxhash (>=1.4.3)"] -[[package]] -name = "proto-plus" -version = "1.22.2" -description = "Beautiful, Pythonic protocol buffers." -category = "main" -optional = false -python-versions = ">=3.6" -files = [ - {file = "proto-plus-1.22.2.tar.gz", hash = "sha256:0e8cda3d5a634d9895b75c573c9352c16486cb75deb0e078b5fda34db4243165"}, - {file = "proto_plus-1.22.2-py3-none-any.whl", hash = "sha256:de34e52d6c9c6fcd704192f09767cb561bb4ee64e70eede20b0834d841f0be4d"}, -] - -[package.dependencies] -protobuf = ">=3.19.0,<5.0.0dev" - -[package.extras] -testing = ["google-api-core[grpc] (>=1.31.5)"] - [[package]] name = "protobuf" version = "3.19.6" @@ -3218,42 +3045,37 @@ test-compat = ["libarchive-c"] [[package]] name = "pyarrow" -version = "7.0.0" +version = "11.0.0" description = "Python library for Apache Arrow" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pyarrow-7.0.0-cp310-cp310-macosx_10_13_universal2.whl", hash = "sha256:0f15213f380539c9640cb2413dc677b55e70f04c9e98cfc2e1d8b36c770e1036"}, - {file = "pyarrow-7.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:29c4e3b3be0b94d07ff4921a5e410fc690a3a066a850a302fc504de5fc638495"}, - {file = "pyarrow-7.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8a9bfc8a016bcb8f9a8536d2fa14a890b340bc7a236275cd60fd4fb8b93ff405"}, - {file = "pyarrow-7.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:49d431ed644a3e8f53ae2bbf4b514743570b495b5829548db51610534b6eeee7"}, - {file = "pyarrow-7.0.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aa6442a321c1e49480b3d436f7d631c895048a16df572cf71c23c6b53c45ed66"}, - {file = "pyarrow-7.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6b01a23cb401750092c6f7c4dcae67cd8fd6b99ae710e26f654f23508f25f25"}, - {file = "pyarrow-7.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f10928745c6ff66e121552731409803bed86c66ac79c64c90438b053b5242c5"}, - {file = "pyarrow-7.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:759090caa1474cafb5e68c93a9bd6cb45d8bb8e4f2cad2f1a0cc9439bae8ae88"}, - {file = "pyarrow-7.0.0-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:e3fe34bcfc28d9c4a747adc3926d2307a04c5c50b89155946739515ccfe5eab0"}, - {file = "pyarrow-7.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:040dce5345603e4e621bcf4f3b21f18d557852e7b15307e559bb14c8951c8714"}, - {file = "pyarrow-7.0.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ed4b647c3345ae3463d341a9d28d0260cd302fb92ecf4e2e3e0f1656d6e0e55c"}, - {file = "pyarrow-7.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7fecd5d5604f47e003f50887a42aee06cb8b7bf8e8bf7dc543a22331d9ba832"}, - {file = "pyarrow-7.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f2d00b892fe865e43346acb78761ba268f8bb1cbdba588816590abcb780ee3d"}, - {file = "pyarrow-7.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:f439f7d77201681fd31391d189aa6b1322d27c9311a8f2fce7d23972471b02b6"}, - {file = "pyarrow-7.0.0-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:3e06b0e29ce1e32f219c670c6b31c33d25a5b8e29c7828f873373aab78bf30a5"}, - {file = "pyarrow-7.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:13dc05bcf79dbc1bd2de1b05d26eb64824b85883d019d81ca3c2eca9b68b5a44"}, - {file = "pyarrow-7.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:06183a7ff2b0c030ec0413fc4dc98abad8cf336c78c280a0b7f4bcbebb78d125"}, - {file = "pyarrow-7.0.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:702c5a9f960b56d03569eaaca2c1a05e8728f05ea1a2138ef64234aa53cd5884"}, - {file = "pyarrow-7.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7313038203df77ec4092d6363dbc0945071caa72635f365f2b1ae0dd7469865"}, - {file = "pyarrow-7.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e87d1f7dc7a0b2ecaeb0c7a883a85710f5b5626d4134454f905571c04bc73d5a"}, - {file = "pyarrow-7.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:ba69488ae25c7fde1a2ae9ea29daf04d676de8960ffd6f82e1e13ca945bb5861"}, - {file = "pyarrow-7.0.0-cp39-cp39-macosx_10_13_universal2.whl", hash = "sha256:11a591f11d2697c751261c9d57e6e5b0d38fdc7f0cc57f4fd6edc657da7737df"}, - {file = "pyarrow-7.0.0-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:6183c700877852dc0f8a76d4c0c2ffd803ba459e2b4a452e355c2d58d48cf39f"}, - {file = "pyarrow-7.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1748154714b543e6ae8452a68d4af85caf5298296a7e5d4d00f1b3021838ac6"}, - {file = "pyarrow-7.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcc8f934c7847a88f13ec35feecffb61fe63bb7a3078bd98dd353762e969ce60"}, - {file = "pyarrow-7.0.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:759f59ac77b84878dbd54d06cf6df74ff781b8e7cf9313eeffbb5ec97b94385c"}, - {file = "pyarrow-7.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d3e3f93ac2993df9c5e1922eab7bdea047b9da918a74e52145399bc1f0099a3"}, - {file = "pyarrow-7.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:306120af554e7e137895254a3b4741fad682875a5f6403509cd276de3fe5b844"}, - {file = "pyarrow-7.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:087769dac6e567d58d59b94c4f866b3356c00d3db5b261387ece47e7324c2150"}, - {file = "pyarrow-7.0.0.tar.gz", hash = "sha256:da656cad3c23a2ebb6a307ab01d35fce22f7850059cffafcb90d12590f8f4f38"}, + {file = "pyarrow-11.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:40bb42afa1053c35c749befbe72f6429b7b5f45710e85059cdd534553ebcf4f2"}, + {file = "pyarrow-11.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7c28b5f248e08dea3b3e0c828b91945f431f4202f1a9fe84d1012a761324e1ba"}, + {file = "pyarrow-11.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a37bc81f6c9435da3c9c1e767324ac3064ffbe110c4e460660c43e144be4ed85"}, + {file = "pyarrow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7c53def8dbbc810282ad308cc46a523ec81e653e60a91c609c2233ae407689"}, + {file = "pyarrow-11.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:25aa11c443b934078bfd60ed63e4e2d42461682b5ac10f67275ea21e60e6042c"}, + {file = "pyarrow-11.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:e217d001e6389b20a6759392a5ec49d670757af80101ee6b5f2c8ff0172e02ca"}, + {file = "pyarrow-11.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ad42bb24fc44c48f74f0d8c72a9af16ba9a01a2ccda5739a517aa860fa7e3d56"}, + {file = "pyarrow-11.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d942c690ff24a08b07cb3df818f542a90e4d359381fbff71b8f2aea5bf58841"}, + {file = "pyarrow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f010ce497ca1b0f17a8243df3048055c0d18dcadbcc70895d5baf8921f753de5"}, + {file = "pyarrow-11.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:2f51dc7ca940fdf17893227edb46b6784d37522ce08d21afc56466898cb213b2"}, + {file = "pyarrow-11.0.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:1cbcfcbb0e74b4d94f0b7dde447b835a01bc1d16510edb8bb7d6224b9bf5bafc"}, + {file = "pyarrow-11.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaee8f79d2a120bf3e032d6d64ad20b3af6f56241b0ffc38d201aebfee879d00"}, + {file = "pyarrow-11.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:410624da0708c37e6a27eba321a72f29d277091c8f8d23f72c92bada4092eb5e"}, + {file = "pyarrow-11.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:2d53ba72917fdb71e3584ffc23ee4fcc487218f8ff29dd6df3a34c5c48fe8c06"}, + {file = "pyarrow-11.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f12932e5a6feb5c58192209af1d2607d488cb1d404fbc038ac12ada60327fa34"}, + {file = "pyarrow-11.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:41a1451dd895c0b2964b83d91019e46f15b5564c7ecd5dcb812dadd3f05acc97"}, + {file = "pyarrow-11.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:becc2344be80e5dce4e1b80b7c650d2fc2061b9eb339045035a1baa34d5b8f1c"}, + {file = "pyarrow-11.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f40be0d7381112a398b93c45a7e69f60261e7b0269cc324e9f739ce272f4f70"}, + {file = "pyarrow-11.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:362a7c881b32dc6b0eccf83411a97acba2774c10edcec715ccaab5ebf3bb0835"}, + {file = "pyarrow-11.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:ccbf29a0dadfcdd97632b4f7cca20a966bb552853ba254e874c66934931b9841"}, + {file = "pyarrow-11.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e99be85973592051e46412accea31828da324531a060bd4585046a74ba45854"}, + {file = "pyarrow-11.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69309be84dcc36422574d19c7d3a30a7ea43804f12552356d1ab2a82a713c418"}, + {file = "pyarrow-11.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da93340fbf6f4e2a62815064383605b7ffa3e9eeb320ec839995b1660d69f89b"}, + {file = "pyarrow-11.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:caad867121f182d0d3e1a0d36f197df604655d0b466f1bc9bafa903aa95083e4"}, + {file = "pyarrow-11.0.0.tar.gz", hash = "sha256:5461c57dbdb211a632a48facb9b39bbeb8a7905ec95d768078525283caef5f6d"}, ] [package.dependencies] @@ -3445,21 +3267,6 @@ files = [ {file = "pycryptodomex-3.17.tar.gz", hash = "sha256:0af93aad8d62e810247beedef0261c148790c52f3cd33643791cc6396dd217c1"}, ] -[[package]] -name = "pydot" -version = "1.4.2" -description = "Python interface to Graphviz's Dot" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pydot-1.4.2-py2.py3-none-any.whl", hash = "sha256:66c98190c65b8d2e2382a441b4c0edfdb4f4c025ef9cb9874de478fb0793a451"}, - {file = "pydot-1.4.2.tar.gz", hash = "sha256:248081a39bcb56784deb018977e428605c1c758f10897a339fce1dd728ff007d"}, -] - -[package.dependencies] -pyparsing = ">=2.1.4" - [[package]] name = "pydub" version = "0.25.1" @@ -3646,7 +3453,7 @@ zstd = ["zstandard"] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "main" +category = "dev" optional = false python-versions = ">=3.6.8" files = [ @@ -5580,4 +5387,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "3.9.15" -content-hash = "f3944a99f45c8d6f063ec218eb246c3c6ef4015d6f633350ec026948efd54549" +content-hash = "22687b53309b318053980c87bcec9741ce1be71bcbb792cf510a721d92919e5e" diff --git a/services/worker/pyproject.toml b/services/worker/pyproject.toml index 17e16b5e8c..6a3d4378db 100644 --- a/services/worker/pyproject.toml +++ b/services/worker/pyproject.toml @@ -9,12 +9,12 @@ license = "Apache-2.0" Pillow = "^9.4.0" PyICU = "^2.10.2" aiohttp = "^3.8.4" -apache-beam = "2.41.0" # ^2 gives a InvalidWheelName error because it tries to install 2.42 that has not been released... bs4 = "^0.0.1" conllu = "^4.5.2" -datasets = { extras = ["audio", "vision"], version = "~2.10.0" } +datasets = { extras = ["audio", "vision"], version = "^2.10.1" } environs = "^9.5.0" gdown = "^4.6.3" +hffs = {git = "https://github.com/huggingface/hffs.git", rev="0e187e74d38e9436353691f4a7a26b15f0663f58"} huggingface-hub = "^0.12.0" kenlm = { git = "https://github.com/kpu/kenlm", branch = "master" } kss = "^2.6.0" @@ -23,8 +23,10 @@ lm-dataformat = "^0.0.20" lxml = "^4.9.2" nlp = "^0.4.0" nltk = "^3.8.1" +numpy = "~1.22.4" openpyxl = "^3.1.1" pdf2image = "^1.16.2" +pyarrow = "^11.0.0" py7zr = "^0.20.4" pydub = "^0.25.1" pypdf2 = "^3.0.1" @@ -84,5 +86,10 @@ exclude = 'vendors' strict = true [[tool.mypy.overrides]] -module = "datasets.*" +module = [ + "datasets.*", + "hffs.*", + "pyarrow.*", + "tqdm.*" +] ignore_missing_imports = true diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index 14eb69a893..982209aee2 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -328,7 +328,7 @@ def create_dataset_info_response_for_csv(dataset: str, config: str) -> Any: } -def create_dataset_info_response_for_audio(dataset: str, config: str) -> Any: +def create_dataset_info_response_for_audio() -> Any: return { "description": "", "citation": "", @@ -338,13 +338,13 @@ def create_dataset_info_response_for_audio(dataset: str, config: str) -> Any: "splits": {"train": {"name": "train", "num_bytes": 59, "num_examples": 1, "dataset_name": "parquet"}}, "download_checksums": { "SOME_KEY": { - "num_bytes": 1383, + "num_bytes": AUDIO_PARQUET_SIZE, "checksum": None, } }, - "download_size": 1383, + "download_size": AUDIO_PARQUET_SIZE, "dataset_size": 59, - "size_in_bytes": 1442, + "size_in_bytes": 1443, } @@ -356,7 +356,7 @@ def create_parquet_and_dataset_info_response(dataset: str, data_type: Literal["c info = ( create_dataset_info_response_for_csv(dataset, config) if data_type == "csv" - else create_dataset_info_response_for_audio(dataset, config) + else create_dataset_info_response_for_audio() ) return { "parquet_files": [ @@ -375,8 +375,8 @@ def create_parquet_and_dataset_info_response(dataset: str, data_type: Literal["c } -CSV_PARQUET_SIZE = 1_865 -AUDIO_PARQUET_SIZE = 1_383 +CSV_PARQUET_SIZE = 1_866 +AUDIO_PARQUET_SIZE = 1_384 DATA_cols = { "col_1": {"_type": "Value", "dtype": "int64"},