diff --git a/conftest.py b/conftest.py deleted file mode 100644 index f9c9e4e9d..000000000 --- a/conftest.py +++ /dev/null @@ -1,596 +0,0 @@ -from __future__ import annotations - -import logging -import os -import secrets -import shutil -from collections import namedtuple -from datetime import date, datetime, timedelta -from importlib import import_module -from pathlib import Path, PurePosixPath -from typing import TYPE_CHECKING -from unittest.mock import Mock - -import pytest -from etl_entities import ( - Column, - DateHWM, - DateTimeHWM, - FileListHWM, - IntHWM, - RemoteFolder, - Table, -) -from pytest_lazyfixture import lazy_fixture - -if TYPE_CHECKING: - from pyspark.sql import SparkSession - -# disable failing plugin import -os.environ["ONETL_PLUGINS_BLACKLIST"] = "failing-plugin" - -from onetl.hwm.store import MemoryHWMStore -from tests.lib.common import upload_files - -log = logging.getLogger(__name__) - -PreparedDbInfo = namedtuple("PreparedDbInfo", ["full_name", "schema", "table"]) - - -@pytest.fixture(scope="session") -def ftp_server(): - FTPServer = namedtuple("FTPServer", ["host", "port", "user", "password"]) - - return FTPServer( - host=os.getenv("ONETL_FTP_HOST"), - port=os.getenv("ONETL_FTP_PORT"), - user=os.getenv("ONETL_FTP_USER"), - password=os.getenv("ONETL_FTP_PASSWORD"), - ) - - -@pytest.fixture( - scope="function", - params=[ - pytest.param( - "real", - marks=[pytest.mark.ftp, pytest.mark.file_connection, pytest.mark.connection], - ), - ], -) -def ftp_data(ftp_server): - from onetl.connection import FTP - - ftp = FTP( - host=ftp_server.host, - port=ftp_server.port, - user=ftp_server.user, - password=ftp_server.password, - ) - - return ftp, PurePosixPath("/export/news_parse") - - -@pytest.fixture() -def ftp_connection(ftp_data): - return ftp_data[0] - - -@pytest.fixture(scope="session") -def ftps_server(): - FTPSServer = namedtuple("FTPSServer", ["host", "port", "user", "password"]) - - return FTPSServer( - host=os.getenv("ONETL_FTPS_HOST"), - port=os.getenv("ONETL_FTPS_PORT"), - user=os.getenv("ONETL_FTPS_USER"), - password=os.getenv("ONETL_FTPS_PASSWORD"), - ) - - -@pytest.fixture( - scope="function", - params=[ - pytest.param( - "real", - marks=[pytest.mark.ftps, pytest.mark.file_connection, pytest.mark.connection], - ), - ], -) -def ftps_data(ftps_server): - from onetl.connection import FTPS - - ftps = FTPS( - host=ftps_server.host, - port=ftps_server.port, - user=ftps_server.user, - password=ftps_server.password, - ) - - return ftps, PurePosixPath("/export/news_parse") - - -@pytest.fixture() -def ftps_connection(ftps_data): - return ftps_data[0] - - -@pytest.fixture(scope="session") -def s3_server(): - S3Server = namedtuple("S3Server", ["host", "port", "bucket", "access_key", "secret_key", "protocol"]) - - return S3Server( - host=os.getenv("ONETL_S3_HOST"), - port=os.getenv("ONETL_S3_PORT"), - bucket=os.getenv("ONETL_S3_BUCKET"), - access_key=os.getenv("ONETL_S3_ACCESS_KEY"), - secret_key=os.getenv("ONETL_S3_SECRET_KEY"), - protocol=os.getenv("ONETL_S3_PROTOCOL", "http").lower(), - ) - - -@pytest.fixture( - scope="function", - params=[ - pytest.param("real", marks=[pytest.mark.s3, pytest.mark.file_connection, pytest.mark.connection]), - ], -) -def s3_data(s3_server): - from onetl.connection import S3 - - s3 = S3( - host=s3_server.host, - port=s3_server.port, - bucket=s3_server.bucket, - access_key=s3_server.access_key, - secret_key=s3_server.secret_key, - protocol=s3_server.protocol, - ) - - if not s3.client.bucket_exists(s3_server.bucket): - 
s3.client.make_bucket(s3_server.bucket) - - return s3, PurePosixPath("/export/news_parse") - - -@pytest.fixture() -def s3_connection(s3_data): - return s3_data[0] - - -@pytest.fixture(scope="session") -def sftp_server(): - SFTPServer = namedtuple("SFTPServer", ["host", "port", "user", "password"]) - - return SFTPServer( - host=os.getenv("ONETL_SFTP_HOST"), - port=os.getenv("ONETL_SFTP_PORT"), - user=os.getenv("ONETL_SFTP_USER"), - password=os.getenv("ONETL_SFTP_PASSWORD"), - ) - - -@pytest.fixture( - scope="function", - params=[ - pytest.param( - "real", - marks=[pytest.mark.sftp, pytest.mark.file_connection, pytest.mark.connection], - ), - ], -) -def sftp_data(sftp_server): - from onetl.connection import SFTP - - sftp = SFTP( - host=sftp_server.host, - port=sftp_server.port, - user=sftp_server.user, - password=sftp_server.password, - ) - - return sftp, PurePosixPath("/app/news_parse") - - -@pytest.fixture() -def sftp_connection(sftp_data): - return sftp_data[0] - - -@pytest.fixture(scope="session") -def webdav_server(): - WebDAVServer = namedtuple("WebDAVServer", ["host", "port", "user", "password", "ssl_verify", "protocol"]) - - return WebDAVServer( - host=os.getenv("ONETL_WEBDAV_HOST"), - port=os.getenv("ONETL_WEBDAV_PORT"), - user=os.getenv("ONETL_WEBDAV_USER"), - password=os.getenv("ONETL_WEBDAV_PASSWORD"), - ssl_verify=os.getenv("ONETL_WEBDAV_SSL_VERIFY", "false").lower() != "true", - protocol=os.getenv("ONETL_WEBDAV_PROTOCOL", "http").lower(), - ) - - -@pytest.fixture( - scope="session", - params=[ - pytest.param( - "real", - marks=[pytest.mark.webdav, pytest.mark.file_connection, pytest.mark.connection], - ), - ], -) -def webdav_data(webdav_server): - from onetl.connection import WebDAV - - webdav = WebDAV( - host=webdav_server.host, - port=webdav_server.port, - user=webdav_server.user, - password=webdav_server.password, - ssl_verify=webdav_server.ssl_verify, - protocol=webdav_server.protocol, - ) - - return webdav, PurePosixPath("/export/news_parse") - - -@pytest.fixture() -def webdav_connection(webdav_data): - return webdav_data[0] - - -@pytest.fixture( - scope="session", - params=[ - pytest.param("real", marks=[pytest.mark.hdfs, pytest.mark.file_connection, pytest.mark.connection]), - ], -) -def hdfs_server(): - HDFSServer = namedtuple("HDFSServer", ["host", "port"]) - - return HDFSServer( - host=os.getenv("ONETL_HDFS_HOST"), - port=os.getenv("ONETL_HDFS_PORT"), - ) - - -@pytest.fixture(scope="function") -def hdfs_data(hdfs_server): - from onetl.connection import HDFS - - hdfs = HDFS(host=hdfs_server.host, port=hdfs_server.port) - return hdfs, PurePosixPath("/export/news_parse") - - -@pytest.fixture() -def hdfs_connection(hdfs_data): - return hdfs_data[0] - - -@pytest.fixture(scope="function") -def resource_path(tmp_path_factory): - original_files = Path(__file__).parent / "tests" / "resources" / "src" - - temp_dir = tmp_path_factory.mktemp("test_files") / secrets.token_hex(5) - shutil.copytree(original_files, temp_dir) - return temp_dir - - -@pytest.fixture(scope="function") -def test_files(resource_path): - resources = resource_path / "news_parse_zp" / "2018_03_05_10_00_00" - - return [ - resources / "newsage-zp-2018_03_05_10_00_00.csv", - resources / "newsage-zp-2018_03_05_10_10_00.csv", - ] - - -@pytest.fixture(scope="function") -def upload_files_with_encoding(file_all_connections, source_path): - local_root_filename = Path(__file__).parent / "tests" / "resources" - remote_root_filename = source_path - files = ["file_connection_utf.txt", "file_connection_ascii.txt"] - - for 
file in files: - file_all_connections.upload_file(local_root_filename / file, remote_root_filename / file) - - return { - "utf": remote_root_filename / "file_connection_utf.txt", - "ascii": remote_root_filename / "file_connection_ascii.txt", - } - - -@pytest.fixture(scope="session") -def warehouse_dir(tmp_path_factory): - # https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html - path = tmp_path_factory.mktemp("spark-warehouse") - yield path - shutil.rmtree(path, ignore_errors=True) - - -@pytest.fixture(scope="session") -def spark_metastore_dir(tmp_path_factory): - # https://stackoverflow.com/a/44048667 - path = tmp_path_factory.mktemp("metastore_db") - yield path - shutil.rmtree(path, ignore_errors=True) - - -@pytest.fixture(scope="session") -def ivysettings_path(): - return Path(__file__).parent / "tests" / "ivysettings.xml" - - -@pytest.fixture(scope="session") -def spark_packages(): - import pyspark - - from onetl.connection import ( - MSSQL, - Clickhouse, - Greenplum, - MongoDB, - MySQL, - Oracle, - Postgres, - Teradata, - ) - - packages = [ - Clickhouse.package, - MSSQL.package, - MySQL.package, - Oracle.package, - Postgres.package, - Teradata.package, - ] - - with_greenplum = os.getenv("ONETL_DB_WITH_GREENPLUM", "false").lower() == "true" - - pyspark_version = ".".join(pyspark.__version__.split(".")[:2]) - - if pyspark_version == "2.3": - if with_greenplum: - packages.extend([Greenplum.package_spark_2_3]) - return packages - - if pyspark_version == "2.4": - if with_greenplum: - packages.extend([Greenplum.package_spark_2_4]) - return packages - - if pyspark_version == "3.2": - packages.extend([MongoDB.package_spark_3_2]) - if with_greenplum: - packages.extend([Greenplum.package_spark_3_2]) - return packages - - if pyspark_version == "3.3": - packages.extend([MongoDB.package_spark_3_3]) - if not with_greenplum: - return packages - - raise ValueError(f"Greenplum connector does not support Spark {pyspark.__version__}") - - if pyspark_version == "3.4": - packages.extend([MongoDB.package_spark_3_4]) - if not with_greenplum: - return packages - - raise ValueError(f"Greenplum connector does not support Spark {pyspark.__version__}") - - raise ValueError(f"Unsupported Spark version: {pyspark.__version__}") - - -@pytest.fixture( - scope="session", - name="spark", - params=[ - pytest.param("real", marks=[pytest.mark.db_connection, pytest.mark.connection]), - ], -) -def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, spark_packages): - from pyspark.sql import SparkSession - - spark = ( - SparkSession.builder.config("spark.app.name", "onetl") # noqa: WPS221 - .config("spark.master", "local[*]") - .config("spark.jars.packages", ",".join(spark_packages)) - .config("spark.jars.ivySettings", os.fspath(ivysettings_path)) - .config("spark.driver.memory", "1g") - .config("spark.driver.maxResultSize", "1g") - .config("spark.executor.cores", "1") - .config("spark.executor.memory", "1g") - .config("spark.executor.allowSparkContext", "true") # Greenplum uses SparkContext on executor if master==local - .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") - .config("spark.kryoserializer.buffer.max", "256m") - .config("spark.default.parallelism", "1") - .config("spark.driver.extraJavaOptions", f"-Dderby.system.home={os.fspath(spark_metastore_dir)}") - .config("spark.sql.warehouse.dir", warehouse_dir) - .enableHiveSupport() - .getOrCreate() - ) - - yield spark - spark.sparkContext.stop() - spark.stop() - - -@pytest.fixture( - scope="function", - 
params=[ - pytest.param("mock", marks=[pytest.mark.db_connection, pytest.mark.connection]), - ], -) -def spark_mock() -> SparkSession: - from pyspark.sql import SparkSession - - spark = Mock(spec=SparkSession) - spark.sparkContext = Mock() - spark.sparkContext.appName = "abc" - return spark - - -@pytest.fixture() -def processing(request, spark): - processing_classes = { - "clickhouse": ("tests.lib.clickhouse_processing", "ClickhouseProcessing"), - "greenplum": ("tests.lib.greenplum_processing", "GreenplumProcessing"), - "hive": ("tests.lib.hive_processing", "HiveProcessing"), - "mongodb": ("tests.lib.mongodb_processing", "MongoDBProcessing"), - "mssql": ("tests.lib.mssql_processing", "MSSQLProcessing"), - "mysql": ("tests.lib.mysql_processing", "MySQLProcessing"), - "oracle": ("tests.lib.oracle_processing", "OracleProcessing"), - "postgres": ("tests.lib.postgres_processing", "PostgresProcessing"), - } - - db_storage_name = request.function.__name__.split("_")[1] - if db_storage_name not in processing_classes: - raise ValueError(f"Wrong name. Please use one of: {list(processing_classes.keys())}") - - module_name, class_name = processing_classes[db_storage_name] - module = import_module(module_name) - db_processing = getattr(module, class_name) - - if db_storage_name == "hive": - yield db_processing(spark) - else: - with db_processing() as result: - yield result - - -@pytest.fixture -def get_schema_table(processing): - schema = processing.schema - processing.create_schema(schema=schema) - - table = f"test_{secrets.token_hex(5)}" - full_name = f"{schema}.{table}" - - yield PreparedDbInfo(full_name=full_name, schema=schema, table=table) - - try: - processing.drop_table( - table=table, - schema=schema, - ) - except Exception: # noqa: S110 - pass - - -@pytest.fixture -def prepare_schema_table(processing, get_schema_table): - fields = {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} - _, schema, table = get_schema_table - - processing.create_table(schema=schema, table=table, fields=fields) - - return get_schema_table - - -@pytest.fixture -def load_table_data(prepare_schema_table, processing): - _, schema, table = prepare_schema_table - - processing.insert_data( - schema=schema, - table=table, - values=processing.create_pandas_df(), - ) - - return prepare_schema_table - - -@pytest.fixture(scope="function", autouse=True) -def use_memory_hwm_store(request): # noqa: WPS325 - test_function = request.function - entities = test_function.__name__.split("_") - - if "strategy" in entities: - with MemoryHWMStore(): - yield None - - else: - yield None - - -@pytest.fixture( - params=[ - lazy_fixture("ftp_data"), - lazy_fixture("ftps_data"), - lazy_fixture("hdfs_data"), - lazy_fixture("s3_data"), - lazy_fixture("sftp_data"), - lazy_fixture("webdav_data"), - ], -) -def file_connections_data(request): - return request.param - - -@pytest.fixture() -def create_keytab(tmp_path_factory): - path = Path(tmp_path_factory.mktemp("data") / "keytab") - path.write_text("content") - - return path - - -@pytest.fixture() -def file_all_connections(file_connections_data): - return file_connections_data[0] - - -@pytest.fixture(scope="function") -def source_path(file_connections_data): - connection, path = file_connections_data - connection.remove_dir(path, recursive=True) - connection.create_dir(path) - yield path - connection.remove_dir(path, recursive=True) - - -@pytest.fixture(scope="function") -def upload_test_files(file_all_connections, resource_path, source_path): - return 
upload_files(resource_path, source_path, file_all_connections) - - -@pytest.fixture( - params=[ - ( - IntHWM( - source=Table(name=secrets.token_hex(5), db=secrets.token_hex(5), instance="proto://domain.com"), - column=Column(name=secrets.token_hex(5)), - value=10, - ), - 5, - ), - ( - DateHWM( - source=Table(name=secrets.token_hex(5), db=secrets.token_hex(5), instance="proto://domain.com"), - column=Column(name=secrets.token_hex(5)), - value=date(year=2023, month=8, day=15), - ), - timedelta(days=31), - ), - ( - DateTimeHWM( - source=Table(name=secrets.token_hex(5), db=secrets.token_hex(5), instance="proto://domain.com"), - column=Column(name=secrets.token_hex(5)), - value=datetime(year=2023, month=8, day=15, hour=11, minute=22, second=33), - ), - timedelta(seconds=50), - ), - ( - FileListHWM( - source=RemoteFolder(name=f"/absolute/{secrets.token_hex(5)}", instance="ftp://ftp.server:21"), - value=["some/path", "another.file"], - ), - "third.file", - ), - ], -) -def hwm_delta(request): - return request.param diff --git a/setup.cfg b/setup.cfg index 5afcc1eb4..17f84d0f9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -258,6 +258,8 @@ ignore = WPS601, # WPS604 Found incorrect node inside `class` body: pass WPS604, +# WPS100 Found wrong module name: util + WPS100 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..6afbfb0f0 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,20 @@ +import os + +# disable failing plugin import +os.environ["ONETL_PLUGINS_BLACKLIST"] = "failing-plugin" + +pytest_plugins = [ + "tests.fixtures.file_connections.fixtures", + "tests.fixtures.file_connections.ftp", + "tests.fixtures.file_connections.ftps", + "tests.fixtures.file_connections.hdfs", + "tests.fixtures.file_connections.s3", + "tests.fixtures.file_connections.sftp", + "tests.fixtures.file_connections.webdav", + "tests.fixtures.processing.fixtures", + "tests.fixtures.create_keytab", + "tests.fixtures.global_hwm_store", + "tests.fixtures.hwm_delta", + "tests.fixtures.spark_mock", + "tests.fixtures.spark", +] diff --git a/tests/lib/__init__.py b/tests/fixtures/__init__.py similarity index 100% rename from tests/lib/__init__.py rename to tests/fixtures/__init__.py diff --git a/tests/fixtures/create_keytab.py b/tests/fixtures/create_keytab.py new file mode 100644 index 000000000..b440b6a65 --- /dev/null +++ b/tests/fixtures/create_keytab.py @@ -0,0 +1,11 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture() +def create_keytab(tmp_path_factory): + path = Path(tmp_path_factory.mktemp("data") / "keytab") + path.write_text("content") + + return path diff --git a/tests/resources/src/exclude_dir/file_4.txt b/tests/fixtures/file_connections/__init__.py similarity index 100% rename from tests/resources/src/exclude_dir/file_4.txt rename to tests/fixtures/file_connections/__init__.py diff --git a/tests/fixtures/file_connections/fixtures.py b/tests/fixtures/file_connections/fixtures.py new file mode 100644 index 000000000..11d6a0636 --- /dev/null +++ b/tests/fixtures/file_connections/fixtures.py @@ -0,0 +1,70 @@ +import secrets +import shutil +from pathlib import Path + +import pytest +from pytest_lazyfixture import lazy_fixture + + +@pytest.fixture(scope="session") +def resource_path_original(): + path = Path(__file__).parent.parent.parent / "resources" + assert path.exists() + return path + + +@pytest.fixture() +def 
resource_path(resource_path_original, tmp_path_factory): + temp_dir = tmp_path_factory.mktemp("test_files") / secrets.token_hex(5) + shutil.copytree(resource_path_original, temp_dir) + return temp_dir + + +@pytest.fixture() +def test_files(resource_path): + return [ + resource_path / "raw/ascii.txt", + resource_path / "raw/utf-8.txt", + ] + + +@pytest.fixture( + params=[ + lazy_fixture("ftp_file_connection"), + lazy_fixture("ftps_file_connection"), + lazy_fixture("hdfs_file_connection"), + lazy_fixture("s3_file_connection"), + lazy_fixture("sftp_file_connection"), + lazy_fixture("webdav_file_connection"), + ], +) +def file_connection(request): + return request.param + + +@pytest.fixture( + params=[ + lazy_fixture("ftp_file_connection_with_path"), + lazy_fixture("ftps_file_connection_with_path"), + lazy_fixture("hdfs_file_connection_with_path"), + lazy_fixture("s3_file_connection_with_path"), + lazy_fixture("sftp_file_connection_with_path"), + lazy_fixture("webdav_file_connection_with_path"), + ], +) +def file_connection_with_path(request): + return request.param + + +@pytest.fixture( + params=[ + lazy_fixture("ftp_file_connection_with_path_and_files"), + lazy_fixture("ftps_file_connection_with_path_and_files"), + lazy_fixture("hdfs_file_connection_with_path_and_files"), + lazy_fixture("s3_file_connection_with_path_and_files"), + lazy_fixture("sftp_file_connection_with_path_and_files"), + lazy_fixture("webdav_file_connection_with_path_and_files"), + ], +) +def file_connection_with_path_and_files(request): + return request.param diff --git a/tests/fixtures/file_connections/ftp.py b/tests/fixtures/file_connections/ftp.py new file mode 100644 index 000000000..3c1a5e3be --- /dev/null +++ b/tests/fixtures/file_connections/ftp.py @@ -0,0 +1,60 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.fixtures.file_connections.util import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real", marks=[pytest.mark.ftp, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def ftp_server(): + FTPServer = namedtuple("FTPServer", ["host", "port", "user", "password"]) + + return FTPServer( + host=os.getenv("ONETL_FTP_HOST"), + port=os.getenv("ONETL_FTP_PORT"), + user=os.getenv("ONETL_FTP_USER"), + password=os.getenv("ONETL_FTP_PASSWORD"), + ) + + +@pytest.fixture() +def ftp_file_connection(ftp_server): + from onetl.connection import FTP + + return FTP( + host=ftp_server.host, + port=ftp_server.port, + user=ftp_server.user, + password=ftp_server.password, + ) + + +@pytest.fixture() +def ftp_file_connection_with_path(request, ftp_file_connection): + connection = ftp_file_connection + root = PurePosixPath("/data/") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + connection.create_dir(root) + + return connection, root + + +@pytest.fixture() +def ftp_file_connection_with_path_and_files(resource_path_original, ftp_file_connection_with_path): + connection, upload_to = ftp_file_connection_with_path + upload_from = resource_path_original + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/fixtures/file_connections/ftps.py b/tests/fixtures/file_connections/ftps.py new file mode 100644 index 000000000..a7942b816 --- /dev/null +++ b/tests/fixtures/file_connections/ftps.py @@ -0,0 +1,60 @@ +import os +from collections import namedtuple +from pathlib 
import PurePosixPath + +import pytest + +from tests.fixtures.file_connections.util import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real", marks=[pytest.mark.ftps, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def ftps_server(): + FTPSServer = namedtuple("FTPSServer", ["host", "port", "user", "password"]) + + return FTPSServer( + host=os.getenv("ONETL_FTPS_HOST"), + port=os.getenv("ONETL_FTPS_PORT"), + user=os.getenv("ONETL_FTPS_USER"), + password=os.getenv("ONETL_FTPS_PASSWORD"), + ) + + +@pytest.fixture() +def ftps_file_connection(ftps_server): + from onetl.connection import FTPS + + return FTPS( + host=ftps_server.host, + port=ftps_server.port, + user=ftps_server.user, + password=ftps_server.password, + ) + + +@pytest.fixture() +def ftps_file_connection_with_path(request, ftps_file_connection): + connection = ftps_file_connection + root = PurePosixPath("/data/") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + connection.create_dir(root) + + return connection, root + + +@pytest.fixture() +def ftps_file_connection_with_path_and_files(resource_path_original, ftps_file_connection_with_path): + connection, upload_to = ftps_file_connection_with_path + upload_from = resource_path_original + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/fixtures/file_connections/hdfs.py b/tests/fixtures/file_connections/hdfs.py new file mode 100644 index 000000000..39cba18af --- /dev/null +++ b/tests/fixtures/file_connections/hdfs.py @@ -0,0 +1,52 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.fixtures.file_connections.util import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real", marks=[pytest.mark.hdfs, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def hdfs_server(): + HDFSServer = namedtuple("HDFSServer", ["host", "port"]) + return HDFSServer( + host=os.getenv("ONETL_HDFS_HOST"), + port=os.getenv("ONETL_HDFS_PORT"), + ) + + +@pytest.fixture() +def hdfs_file_connection(hdfs_server): + from onetl.connection import HDFS + + return HDFS(host=hdfs_server.host, port=hdfs_server.port) + + +@pytest.fixture() +def hdfs_file_connection_with_path(request, hdfs_file_connection): + connection = hdfs_file_connection + root = PurePosixPath("/data/") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + connection.create_dir(root) + + return connection, root + + +@pytest.fixture() +def hdfs_file_connection_with_path_and_files(resource_path_original, hdfs_file_connection_with_path): + connection, upload_to = hdfs_file_connection_with_path + upload_from = resource_path_original + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/fixtures/file_connections/s3.py b/tests/fixtures/file_connections/s3.py new file mode 100644 index 000000000..36877fae7 --- /dev/null +++ b/tests/fixtures/file_connections/s3.py @@ -0,0 +1,67 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.fixtures.file_connections.util import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real", marks=[pytest.mark.s3, pytest.mark.file_connection, 
pytest.mark.connection]), + ], +) +def s3_server(): + S3Server = namedtuple("S3Server", ["host", "port", "bucket", "access_key", "secret_key", "protocol"]) + + return S3Server( + host=os.getenv("ONETL_S3_HOST"), + port=os.getenv("ONETL_S3_PORT"), + bucket=os.getenv("ONETL_S3_BUCKET"), + access_key=os.getenv("ONETL_S3_ACCESS_KEY"), + secret_key=os.getenv("ONETL_S3_SECRET_KEY"), + protocol=os.getenv("ONETL_S3_PROTOCOL", "http").lower(), + ) + + +@pytest.fixture() +def s3_file_connection(s3_server): + from onetl.connection import S3 + + return S3( + host=s3_server.host, + port=s3_server.port, + bucket=s3_server.bucket, + access_key=s3_server.access_key, + secret_key=s3_server.secret_key, + protocol=s3_server.protocol, + ) + + +@pytest.fixture() +def s3_file_connection_with_path(request, s3_file_connection): + connection = s3_file_connection + root = PurePosixPath("/data/") + + if not connection.client.bucket_exists(connection.bucket): + connection.client.make_bucket(connection.bucket) + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + connection.create_dir(root) + + return connection, root + + +@pytest.fixture() +def s3_file_connection_with_path_and_files(resource_path_original, s3_file_connection_with_path): + connection, upload_to = s3_file_connection_with_path + upload_from = resource_path_original + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/fixtures/file_connections/sftp.py b/tests/fixtures/file_connections/sftp.py new file mode 100644 index 000000000..ab83bd5e1 --- /dev/null +++ b/tests/fixtures/file_connections/sftp.py @@ -0,0 +1,60 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.fixtures.file_connections.util import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param("real", marks=[pytest.mark.sftp, pytest.mark.file_connection, pytest.mark.connection]), + ], +) +def sftp_server(): + SFTPServer = namedtuple("SFTPServer", ["host", "port", "user", "password"]) + + return SFTPServer( + host=os.getenv("ONETL_SFTP_HOST"), + port=os.getenv("ONETL_SFTP_PORT"), + user=os.getenv("ONETL_SFTP_USER"), + password=os.getenv("ONETL_SFTP_PASSWORD"), + ) + + +@pytest.fixture() +def sftp_file_connection(sftp_server): + from onetl.connection import SFTP + + return SFTP( + host=sftp_server.host, + port=sftp_server.port, + user=sftp_server.user, + password=sftp_server.password, + ) + + +@pytest.fixture() +def sftp_file_connection_with_path(request, sftp_file_connection): + connection = sftp_file_connection + root = PurePosixPath("/app/data/") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + connection.create_dir(root) + + return connection, root + + +@pytest.fixture() +def sftp_file_connection_with_path_and_files(resource_path_original, sftp_file_connection_with_path): + connection, upload_to = sftp_file_connection_with_path + upload_from = resource_path_original + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/lib/common.py b/tests/fixtures/file_connections/util.py similarity index 100% rename from tests/lib/common.py rename to tests/fixtures/file_connections/util.py diff --git a/tests/fixtures/file_connections/webdav.py b/tests/fixtures/file_connections/webdav.py new file 
mode 100644 index 000000000..cf12d25db --- /dev/null +++ b/tests/fixtures/file_connections/webdav.py @@ -0,0 +1,67 @@ +import os +from collections import namedtuple +from pathlib import PurePosixPath + +import pytest + +from tests.fixtures.file_connections.util import upload_files + + +@pytest.fixture( + scope="session", + params=[ + pytest.param( + "real", + marks=[pytest.mark.webdav, pytest.mark.file_connection, pytest.mark.connection], + ), + ], +) +def webdav_server(): + WebDAVServer = namedtuple("WebDAVServer", ["host", "port", "user", "password", "ssl_verify", "protocol"]) + + return WebDAVServer( + host=os.getenv("ONETL_WEBDAV_HOST"), + port=os.getenv("ONETL_WEBDAV_PORT"), + user=os.getenv("ONETL_WEBDAV_USER"), + password=os.getenv("ONETL_WEBDAV_PASSWORD"), + ssl_verify=os.getenv("ONETL_WEBDAV_SSL_VERIFY", "false").lower() != "true", + protocol=os.getenv("ONETL_WEBDAV_PROTOCOL", "http").lower(), + ) + + +@pytest.fixture() +def webdav_file_connection(webdav_server): + from onetl.connection import WebDAV + + return WebDAV( + host=webdav_server.host, + port=webdav_server.port, + user=webdav_server.user, + password=webdav_server.password, + ssl_verify=webdav_server.ssl_verify, + protocol=webdav_server.protocol, + ) + + +@pytest.fixture() +def webdav_file_connection_with_path(request, webdav_file_connection): + connection = webdav_file_connection + root = PurePosixPath("/data/") + + def finalizer(): + connection.remove_dir(root, recursive=True) + + request.addfinalizer(finalizer) + + connection.remove_dir(root, recursive=True) + connection.create_dir(root) + + return connection, root + + +@pytest.fixture() +def webdav_file_connection_with_path_and_files(resource_path_original, webdav_file_connection_with_path): + connection, upload_to = webdav_file_connection_with_path + upload_from = resource_path_original + files = upload_files(upload_from, upload_to, connection) + return connection, upload_to, files diff --git a/tests/fixtures/global_hwm_store.py b/tests/fixtures/global_hwm_store.py new file mode 100644 index 000000000..0b372959b --- /dev/null +++ b/tests/fixtures/global_hwm_store.py @@ -0,0 +1,16 @@ +import pytest + +from onetl.hwm.store import MemoryHWMStore + + +@pytest.fixture(scope="function", autouse=True) +def global_hwm_store(request): # noqa: WPS325 + test_function = request.function + entities = test_function.__name__.split("_") + + if "strategy" in entities: + with MemoryHWMStore(): + yield None + + else: + yield None diff --git a/tests/fixtures/hwm_delta.py b/tests/fixtures/hwm_delta.py new file mode 100644 index 000000000..dd981367e --- /dev/null +++ b/tests/fixtures/hwm_delta.py @@ -0,0 +1,52 @@ +import secrets +from datetime import date, datetime, timedelta + +import pytest +from etl_entities import ( + Column, + DateHWM, + DateTimeHWM, + FileListHWM, + IntHWM, + RemoteFolder, + Table, +) + + +@pytest.fixture( + params=[ + ( + IntHWM( + source=Table(name=secrets.token_hex(5), db=secrets.token_hex(5), instance="proto://domain.com"), + column=Column(name=secrets.token_hex(5)), + value=10, + ), + 5, + ), + ( + DateHWM( + source=Table(name=secrets.token_hex(5), db=secrets.token_hex(5), instance="proto://domain.com"), + column=Column(name=secrets.token_hex(5)), + value=date(year=2023, month=8, day=15), + ), + timedelta(days=31), + ), + ( + DateTimeHWM( + source=Table(name=secrets.token_hex(5), db=secrets.token_hex(5), instance="proto://domain.com"), + column=Column(name=secrets.token_hex(5)), + value=datetime(year=2023, month=8, day=15, hour=11, minute=22, second=33), + ), + 
timedelta(seconds=50), + ), + ( + FileListHWM( + source=RemoteFolder(name=f"/absolute/{secrets.token_hex(5)}", instance="ftp://ftp.server:21"), + value=["some/path", "another.file"], + ), + "third.file", + ), + ], +) +def hwm_delta(request): + return request.param diff --git a/tests/resources/src/exclude_dir/file_5.txt b/tests/fixtures/processing/__init__.py similarity index 100% rename from tests/resources/src/exclude_dir/file_5.txt rename to tests/fixtures/processing/__init__.py diff --git a/tests/lib/base_processing.py b/tests/fixtures/processing/base_processing.py similarity index 100% rename from tests/lib/base_processing.py rename to tests/fixtures/processing/base_processing.py diff --git a/tests/lib/clickhouse_processing.py b/tests/fixtures/processing/clickhouse.py similarity index 98% rename from tests/lib/clickhouse_processing.py rename to tests/fixtures/processing/clickhouse.py index 4f290e9fe..1205fad6a 100644 --- a/tests/lib/clickhouse_processing.py +++ b/tests/fixtures/processing/clickhouse.py @@ -9,7 +9,7 @@ import clickhouse_driver import pandas -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing logger = getLogger(__name__) diff --git a/tests/fixtures/processing/fixtures.py b/tests/fixtures/processing/fixtures.py new file mode 100644 index 000000000..2cca96908 --- /dev/null +++ b/tests/fixtures/processing/fixtures.py @@ -0,0 +1,77 @@ +import secrets +from collections import namedtuple +from importlib import import_module + +import pytest + +PreparedDbInfo = namedtuple("PreparedDbInfo", ["full_name", "schema", "table"]) + + +@pytest.fixture() +def processing(request, spark): + processing_classes = { + "clickhouse": ("tests.fixtures.processing.clickhouse", "ClickhouseProcessing"), + "greenplum": ("tests.fixtures.processing.greenplum", "GreenplumProcessing"), + "hive": ("tests.fixtures.processing.hive", "HiveProcessing"), + "mongodb": ("tests.fixtures.processing.mongodb", "MongoDBProcessing"), + "mssql": ("tests.fixtures.processing.mssql", "MSSQLProcessing"), + "mysql": ("tests.fixtures.processing.mysql", "MySQLProcessing"), + "oracle": ("tests.fixtures.processing.oracle", "OracleProcessing"), + "postgres": ("tests.fixtures.processing.postgres", "PostgresProcessing"), + } + + db_storage_name = request.function.__name__.split("_")[1] + if db_storage_name not in processing_classes: + raise ValueError(f"Wrong name. 
Please use one of: {list(processing_classes.keys())}") + + module_name, class_name = processing_classes[db_storage_name] + module = import_module(module_name) + db_processing = getattr(module, class_name) + + if db_storage_name == "hive": + yield db_processing(spark) + else: + with db_processing() as result: + yield result + + +@pytest.fixture +def get_schema_table(processing): + schema = processing.schema + processing.create_schema(schema=schema) + + table = f"test_{secrets.token_hex(5)}" + full_name = f"{schema}.{table}" + + yield PreparedDbInfo(full_name=full_name, schema=schema, table=table) + + try: + processing.drop_table( + table=table, + schema=schema, + ) + except Exception: # noqa: S110 + pass + + +@pytest.fixture +def prepare_schema_table(processing, get_schema_table): + fields = {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} + _, schema, table = get_schema_table + + processing.create_table(schema=schema, table=table, fields=fields) + + return get_schema_table + + +@pytest.fixture +def load_table_data(prepare_schema_table, processing): + _, schema, table = prepare_schema_table + + processing.insert_data( + schema=schema, + table=table, + values=processing.create_pandas_df(), + ) + + return prepare_schema_table diff --git a/tests/lib/greenplum_processing.py b/tests/fixtures/processing/greenplum.py similarity index 91% rename from tests/lib/greenplum_processing.py rename to tests/fixtures/processing/greenplum.py index 8da693092..b47426021 100644 --- a/tests/lib/greenplum_processing.py +++ b/tests/fixtures/processing/greenplum.py @@ -1,6 +1,6 @@ import os -from tests.lib.postgres_processing import PostgresProcessing +from tests.fixtures.processing.postgres import PostgresProcessing class GreenplumProcessing(PostgresProcessing): diff --git a/tests/lib/hive_processing.py b/tests/fixtures/processing/hive.py similarity index 97% rename from tests/lib/hive_processing.py rename to tests/fixtures/processing/hive.py index 6fa05d0b6..2346576b9 100644 --- a/tests/lib/hive_processing.py +++ b/tests/fixtures/processing/hive.py @@ -7,7 +7,7 @@ import pandas -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing if TYPE_CHECKING: from pyspark.sql import SparkSession diff --git a/tests/lib/mongodb_processing.py b/tests/fixtures/processing/mongodb.py similarity index 98% rename from tests/lib/mongodb_processing.py rename to tests/fixtures/processing/mongodb.py index 4a76e29f1..bf3158a99 100644 --- a/tests/lib/mongodb_processing.py +++ b/tests/fixtures/processing/mongodb.py @@ -10,7 +10,7 @@ import pandas from pymongo import MongoClient -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing logger = getLogger(__name__) diff --git a/tests/lib/mssql_processing.py b/tests/fixtures/processing/mssql.py similarity index 98% rename from tests/lib/mssql_processing.py rename to tests/fixtures/processing/mssql.py index 39c8ece06..d175d18f6 100644 --- a/tests/lib/mssql_processing.py +++ b/tests/fixtures/processing/mssql.py @@ -8,7 +8,7 @@ import pymssql from pandas.io import sql as psql -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing logger = getLogger(__name__) diff --git a/tests/lib/mysql_processing.py b/tests/fixtures/processing/mysql.py similarity index 98% rename from tests/lib/mysql_processing.py rename to tests/fixtures/processing/mysql.py 
index 98f617d93..480a4b227 100644 --- a/tests/lib/mysql_processing.py +++ b/tests/fixtures/processing/mysql.py @@ -7,7 +7,7 @@ import pymysql from pandas.io import sql as psql -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing logger = getLogger(__name__) diff --git a/tests/lib/oracle_processing.py b/tests/fixtures/processing/oracle.py similarity index 98% rename from tests/lib/oracle_processing.py rename to tests/fixtures/processing/oracle.py index 482a6039b..509bb9e1f 100644 --- a/tests/lib/oracle_processing.py +++ b/tests/fixtures/processing/oracle.py @@ -7,7 +7,7 @@ import pandas from pandas.io import sql as psql -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing logger = getLogger(__name__) diff --git a/tests/lib/postgres_processing.py b/tests/fixtures/processing/postgres.py similarity index 98% rename from tests/lib/postgres_processing.py rename to tests/fixtures/processing/postgres.py index 9c1bd320a..fbbd43d70 100644 --- a/tests/lib/postgres_processing.py +++ b/tests/fixtures/processing/postgres.py @@ -8,7 +8,7 @@ from psycopg2 import connect as pg_connect from psycopg2.extensions import connection -from tests.lib.base_processing import BaseProcessing +from tests.fixtures.processing.base_processing import BaseProcessing logger = getLogger(__name__) diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py new file mode 100644 index 000000000..52b5e52dd --- /dev/null +++ b/tests/fixtures/spark.py @@ -0,0 +1,123 @@ +import os +import shutil +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="session") +def warehouse_dir(tmp_path_factory): + # https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html + path = tmp_path_factory.mktemp("spark-warehouse") + yield path + shutil.rmtree(path, ignore_errors=True) + + +@pytest.fixture(scope="session") +def spark_metastore_dir(tmp_path_factory): + # https://stackoverflow.com/a/44048667 + path = tmp_path_factory.mktemp("metastore_db") + yield path + shutil.rmtree(path, ignore_errors=True) + + +@pytest.fixture(scope="session") +def ivysettings_path(): + path = Path(__file__).parent.parent / "ivysettings.xml" + assert path.exists() + return path + + +@pytest.fixture(scope="session") +def spark_packages(): + import pyspark + + from onetl.connection import ( + MSSQL, + Clickhouse, + Greenplum, + MongoDB, + MySQL, + Oracle, + Postgres, + Teradata, + ) + + packages = [ + Clickhouse.package, + MSSQL.package, + MySQL.package, + Oracle.package, + Postgres.package, + Teradata.package, + ] + + with_greenplum = os.getenv("ONETL_DB_WITH_GREENPLUM", "false").lower() == "true" + + pyspark_version = ".".join(pyspark.__version__.split(".")[:2]) + + if pyspark_version == "2.3": + if with_greenplum: + packages.extend([Greenplum.package_spark_2_3]) + return packages + + if pyspark_version == "2.4": + if with_greenplum: + packages.extend([Greenplum.package_spark_2_4]) + return packages + + if pyspark_version == "3.2": + packages.extend([MongoDB.package_spark_3_2]) + if with_greenplum: + packages.extend([Greenplum.package_spark_3_2]) + return packages + + if pyspark_version == "3.3": + packages.extend([MongoDB.package_spark_3_3]) + if not with_greenplum: + return packages + + raise ValueError(f"Greenplum connector does not support Spark {pyspark.__version__}") + + if pyspark_version == "3.4": + packages.extend([MongoDB.package_spark_3_4]) + if not with_greenplum: + return packages + + 
raise ValueError(f"Greenplum connector does not support Spark {pyspark.__version__}") + + raise ValueError(f"Unsupported Spark version: {pyspark.__version__}") + + +@pytest.fixture( + scope="session", + name="spark", + params=[ + pytest.param("real", marks=[pytest.mark.db_connection, pytest.mark.connection]), + ], +) +def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, spark_packages): + from pyspark.sql import SparkSession + + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") # noqa: WPS221 + .config("spark.master", "local[*]") + .config("spark.jars.packages", ",".join(spark_packages)) + .config("spark.jars.ivySettings", os.fspath(ivysettings_path)) + .config("spark.driver.memory", "1g") + .config("spark.driver.maxResultSize", "1g") + .config("spark.executor.cores", "1") + .config("spark.executor.memory", "1g") + .config("spark.executor.allowSparkContext", "true") # Greenplum uses SparkContext on executor if master==local + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config("spark.kryoserializer.buffer.max", "256m") + .config("spark.default.parallelism", "1") + .config("spark.driver.extraJavaOptions", f"-Dderby.system.home={os.fspath(spark_metastore_dir)}") + .config("spark.sql.warehouse.dir", warehouse_dir) + .enableHiveSupport() + .getOrCreate() + ) + + yield spark + spark.sparkContext.stop() + spark.stop() diff --git a/tests/fixtures/spark_mock.py b/tests/fixtures/spark_mock.py new file mode 100644 index 000000000..4c89390e3 --- /dev/null +++ b/tests/fixtures/spark_mock.py @@ -0,0 +1,18 @@ +from unittest.mock import Mock + +import pytest + + +@pytest.fixture( + scope="function", + params=[ + pytest.param("mock", marks=[pytest.mark.db_connection, pytest.mark.connection]), + ], +) +def spark_mock(): + from pyspark.sql import SparkSession + + spark = Mock(spec=SparkSession) + spark.sparkContext = Mock() + spark.sparkContext.appName = "abc" + return spark diff --git a/tests/resources/file_connection_ascii.txt b/tests/resources/raw/ascii.txt similarity index 100% rename from tests/resources/file_connection_ascii.txt rename to tests/resources/raw/ascii.txt diff --git a/tests/resources/src/news_parse_zp/exclude_dir/file_1.txt b/tests/resources/raw/exclude_dir/excluded1.txt similarity index 100% rename from tests/resources/src/news_parse_zp/exclude_dir/file_1.txt rename to tests/resources/raw/exclude_dir/excluded1.txt diff --git a/tests/resources/src/news_parse_zp/exclude_dir/file_2.txt b/tests/resources/raw/exclude_dir/nested/excluded2.txt similarity index 100% rename from tests/resources/src/news_parse_zp/exclude_dir/file_2.txt rename to tests/resources/raw/exclude_dir/nested/excluded2.txt diff --git a/tests/resources/src/news_parse_zp/exclude_dir/file_3.txt b/tests/resources/raw/nested/exclude_dir/excluded3.txt similarity index 100% rename from tests/resources/src/news_parse_zp/exclude_dir/file_3.txt rename to tests/resources/raw/nested/exclude_dir/excluded3.txt diff --git a/tests/resources/raw/some.csv b/tests/resources/raw/some.csv new file mode 100644 index 000000000..e229eb3da --- /dev/null +++ b/tests/resources/raw/some.csv @@ -0,0 +1,3 @@ +some,header +1,2 +3,4 diff --git a/tests/resources/file_connection_utf.txt b/tests/resources/raw/utf-8.txt similarity index 100% rename from tests/resources/file_connection_utf.txt rename to tests/resources/raw/utf-8.txt diff --git a/tests/resources/src/news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_00_00.csv 
b/tests/resources/src/news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_00_00.csv deleted file mode 100644 index 48449d3bd..000000000 --- a/tests/resources/src/news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_00_00.csv +++ /dev/null @@ -1,35 +0,0 @@ -DATE|FROM|TO|CARD|CARDNUM|TIME|SUM|BALANCE|CURRENCY|BANKNAME -2018-03-05 10:30:25|779438576505|713037152739|mir-|3278|28:25|2072|4585.32|rub|sberbank -2018-03-05 10:00:47|779438576505|713048496188|mir-|6262|82:64|6809.41|8815.58|rub|sberbank -2018-03-05 10:27:38|779438576505|713055814817|visa|7837|98:96|25331.75|958887.51|rub|sberbank -2018-03-05 10:07:36|779438576505|713060683005|visa|8221|95:37|35360|57287.98|rub|sberbank -2018-03-05 10:07:57|779438576505|713060683005|visa|0114|97:50|25611.66|25977.76|rub|sberbank -2018-03-05 10:23:44|779438576505|713062501527|mir-|3572|14:67|2857|8153.46|rub|sberbank -2018-03-05 10:05:43|779438576505|713067291397|visa|9907|10:23|4431.23|2660.87|rub|sberbank -2018-03-05 10:44:31|779438576505|713075186275|mir-|5771|41:21|69342.30|09823.50|rub|sberbank -2018-03-05 10:58:41|779438576505|713077556899|visa|7784|02:88|201193.52|703825.24|rub|sberbank -2018-03-05 10:55:15|779438576505|713096094330|maes|4805|03:63|62084|68662.61|rub|sberbank -2018-03-05 10:52:44|779438576505|713097103194|mir-|3067|03:45|50001.46|76738.74|rub|sberbank -2018-03-05 10:47:06|779438576505|713098526911|visa|2461|51:64|781|8711.72|rub|sberbank -2018-03-05 10:09:25|779438576505|713099561227|visa|4457|63:33|26580.80|17642.09|rub|sberbank -2018-03-05 10:36:08|779438576505|713099825432|mir-|1696|68:78|82110.48|60096.38|rub|sberbank -2018-03-05 10:11:04|779438576505|713106948966|mir-|8839|83:68|24815.94|81725.69|rub|sberbank -2018-03-05 10:58:23|779438576505|713116318704|mir-|4055|61:57|26256.43|26258.20|rub|sberbank -2018-03-05 10:35:21|779438576505|713118758528|mir-|5912|50:40|1917|27158.98|rub|sberbank -2018-03-05 10:39:45|779438576505|713118758528|mir-|6879|27:01|73401.49|55792.77|rub|sberbank -2018-03-05 10:30:45|779438576505|713128561666|mir-|5238|93:99|19152.78|67540.96|rub|sberbank -2018-03-05 10:21:15|779438576505|713129227065|maes|1094|08:61|4078.34|9084.70|rub|sberbank -2018-03-05 10:26:12|779438576505|713129227065|maes|9322|53:67|2606.54|8465.27|rub|sberbank -2018-03-05 10:30:04|779438576505|713134543987|visa|7955|66:00|02337|24706.85|rub|sberbank -2018-03-05 10:24:40|779438576505|713143681613|ecmc|1941|01:76|1549.11|3393.85|rub|sberbank -2018-03-05 10:37:27|779438576505|713143681613|ecmc|8169|23:41|83887|48305.73|rub|sberbank -2018-03-05 10:09:21|779438576505|713149895672|mir-|2470|90:94|7063.50|0538.03|rub|sberbank -2018-03-05 10:27:16|779438576505|713151644533|mir-|3299|40:92|5989.43|3255.69|rub|sberbank -2018-03-05 10:27:41|779438576505|713151644533|mir-|7431|30:46|26273.87|08510.29|rub|sberbank -2018-03-05 10:08:05|779438576505|713155983135|mir-|0615|47:33|028.57|43424.46|rub|sberbank -2018-03-05 10:07:38|779438576505|713155983135|mir-|3226|10:43|67645.94|48224.50|rub|sberbank -2018-03-05 10:29:38|779438576505|713173277825|mir-|2171|69:54|08879|32707.99|rub|sberbank -2018-03-05 10:08:33|779438576505|713177572231|maes|6035|10:31|58758|94141.20|rub|sberbank -2018-03-05 10:08:15|779438576505|713182257290|visa|4343|97:00|52260|21749.87|rub|sberbank -2018-03-05 10:08:31|779438576505|713182257290|visa|6357|25:14|28870.27|39440.08|rub|sberbank -2018-03-05 10:55:18|779438576505|713192431872|visa|2399|88:00|69083|91632.79|rub|sberbank diff --git 
a/tests/resources/src/news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_10_00.csv b/tests/resources/src/news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_10_00.csv deleted file mode 100644 index f6c664d0d..000000000 --- a/tests/resources/src/news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_10_00.csv +++ /dev/null @@ -1,34 +0,0 @@ -DATE|FROM|TO|CARD|CARDNUM|TIME|SUM|BALANCE|CURRENCY|BANKNAME -2018-03-05 10:34:27|779438576505|713203301416|mir-|6817|58:42|2270|9762.81|rub|sberbank -2018-03-05 10:35:07|779438576505|713203301416|mir-|1967|51:58|2467.03|62762.29|rub|sberbank -2018-03-05 10:36:12|779438576505|713203301416|mir-|5749|16:88|44834.94|24238.41|rub|sberbank -2018-03-05 10:07:26|779438576505|713217462616|visa|5701|81:87|64481|862983.37|rub|sberbank -2018-03-05 10:35:06|779438576505|713247859086|mir-|3918|47:12|3829|5863.38|rub|sberbank -2018-03-05 10:36:05|779438576505|713247859086|mir-|0278|68:28|6187.81|77782.83|rub|sberbank -2018-03-05 10:38:38|779438576505|713247859086|mir-|6366|40:43|0753.33|93947.25|rub|sberbank -2018-03-05 10:12:53|779438576505|713249922493|mir-|4113|60:94|4315.96|9684.06|rub|sberbank -2018-03-05 10:13:00|779438576505|713249922493|mir-|2931|36:12|72009.09|10417.46|rub|sberbank -2018-03-05 10:20:31|779438576505|713266374009|mir-|9709|93:86|34355.77|84818.36|rub|sberbank -2018-03-05 10:33:14|779438576505|713272014073|mir-|2655|12:40|32599.20|00098.51|rub|sberbank -2018-03-05 10:49:01|779438576505|713273566844|mir-|1077|82:64|9145.51|5142.94|rub|sberbank -2018-03-05 10:52:05|779438576505|713278577358|mir-|7881|82:53|2380|4740.43|rub|sberbank -2018-03-05 10:37:21|779438576505|713280562578|mir-|9072|13:69|15569.18|59842.04|rub|sberbank -2018-03-05 10:24:37|779438576505|713291184561|mir-|8007|00:19|02275.22|89117.12|rub|sberbank -2018-03-05 10:50:38|779438576505|713295430662|maes|8627|74:24|63917.91|50954.28|rub|sberbank -2018-03-05 10:30:20|779438576505|713296922432|visa|2080|38:58|68451.60|76777.08|rub|sberbank -2018-03-05 10:21:06|779438576505|713305030172|visa|0583|16:59|7982.95|8108.63|rub|sberbank -2018-03-05 10:38:37|779438576505|713313566745|mir-|6745|58:12|16909.02|15122.61|rub|sberbank -2018-03-05 10:48:01|779438576505|713313685279|maes|0321|47:32|95571|17132.04|rub|sberbank -2018-03-05 10:30:28|779438576505|713319765020|mir-|3762|71:02|020.35|12227.61|rub|sberbank -2018-03-05 10:30:32|779438576505|713319765020|mir-|0195|67:89|50052.75|23588.47|rub|sberbank -2018-03-05 10:34:31|779438576505|713320201250|mir-|6726|19:36|0932.50|9980.92|rub|sberbank -2018-03-05 10:46:42|779438576505|713323676055|visa|7857|32:32|6407|44340.17|rub|sberbank -2018-03-05 10:04:23|779438576505|713329554120|mir-|8402|87:39|1044.85|9353.57|rub|sberbank -2018-03-05 10:06:52|779438576505|713331558634|visa|6535|95:16|7621.65|0066.07|rub|sberbank -2018-03-05 10:09:26|779438576505|713333653163|mir-|2382|56:28|960515.04|439232.21|rub|sberbank -2018-03-05 10:56:05|779438576505|713336931471|mir-|8064|97:12|5608.43|10858.51|rub|sberbank -2018-03-05 10:54:36|779438576505|713336931471|mir-|1757|39:69|21689.65|13849.01|rub|sberbank -2018-03-05 10:39:08|779438576505|713347823993|mir-|5510|14:73|14115.48|30573.60|rub|sberbank -2018-03-05 10:01:19|779438576505|713366255169|mir-|0999|73:87|77737.26|21339.14|rub|sberbank -2018-03-05 10:38:19|779438576505|713367519261|ecmc|0520|69:82|7614|19751.26|rub|sberbank -2018-03-05 10:11:23|779438576505|713370436605|visa|9005|85:60|27252|10868.21|rub|sberbank diff --git 
a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py index bd5051803..14440701e 100644 --- a/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py +++ b/tests/tests_integration/tests_core_integration/test_file_downloader_integration.py @@ -8,7 +8,6 @@ import pytest from etl_entities import FileListHWM -from pytest_lazyfixture import lazy_fixture from onetl.exception import DirectoryNotFoundError, NotAFileError from onetl.file import FileDownloader @@ -29,17 +28,18 @@ ) -def test_downloader_view_file(file_all_connections, source_path, upload_test_files): +def test_file_downloader_view_file(file_connection_with_path_and_files): + file_connection, remote_path, _ = file_connection_with_path_and_files downloader = FileDownloader( - connection=file_all_connections, - source_path=source_path, + connection=file_connection, + source_path=remote_path, local_path="/some/path", ) remote_files = downloader.view_files() remote_files_list = [] - for root, _dirs, files in file_all_connections.walk(source_path): + for root, _dirs, files in file_connection.walk(remote_path): for file in files: remote_files_list.append(RemotePath(root) / file) @@ -54,20 +54,19 @@ def test_downloader_view_file(file_all_connections, source_path, upload_test_fil ids=["run_path_type str", "run_path_type Path"], ) @pytest.mark.parametrize("workers", [1, 3]) -def test_downloader_run( - file_all_connections, - source_path, - upload_test_files, +def test_file_downloader_run( + file_connection_with_path_and_files, path_type, run_path_type, tmp_path_factory, workers, ): + file_connection, remote_path, uploaded_files = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") downloader = FileDownloader( - connection=file_all_connections, - source_path=path_type(source_path), + connection=file_connection, + source_path=path_type(remote_path), local_path=run_path_type(local_path), options=FileDownloader.Options( workers=workers, @@ -82,7 +81,7 @@ def test_downloader_run( assert download_result.successful assert sorted(download_result.successful) == sorted( - local_path / file.relative_to(source_path) for file in upload_test_files + local_path / file.relative_to(remote_path) for file in uploaded_files ) for local_file in download_result.successful: @@ -92,29 +91,28 @@ def test_downloader_run( assert local_file.is_file() assert not local_file.is_dir() - remote_file_path = source_path / local_file.relative_to(local_path) - remote_file = file_all_connections.resolve_file(remote_file_path) + remote_file_path = remote_path / local_file.relative_to(local_path) + remote_file = file_connection.resolve_file(remote_file_path) # file size is same as expected - assert local_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size + assert local_file.stat().st_size == file_connection.get_stat(remote_file).st_size # file content is same as expected - assert local_file.read_bytes() == file_all_connections.read_bytes(remote_file) + assert local_file.read_bytes() == file_connection.read_bytes(remote_file) -def test_downloader_run_delete_source( - file_all_connections, - source_path, - upload_test_files, +def test_file_downloader_run_delete_source( + file_connection_with_path_and_files, resource_path, tmp_path_factory, caplog, ): + file_connection, remote_path, uploaded_files = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         options=FileDownloader.Options(delete_source=True),
     )
@@ -130,7 +128,7 @@ def test_downloader_run_delete_source(
     assert download_result.successful
 
     assert sorted(download_result.successful) == sorted(
-        local_path / file.relative_to(source_path) for file in upload_test_files
+        local_path / file.relative_to(remote_path) for file in uploaded_files
     )
 
     for local_file in download_result.successful:
@@ -145,12 +143,12 @@ def test_downloader_run_delete_source(
         assert local_file.stat().st_size == original_file.stat().st_size
         assert local_file.read_bytes() == original_file.read_bytes()
 
-    if not file_all_connections.path_exists(source_path):
+    if not file_connection.path_exists(remote_path):
         # S3 does not support creating directories
         return
 
     remote_files = FileSet()
-    for root, _dirs, files in file_all_connections.walk(source_path):
+    for root, _dirs, files in file_connection.walk(remote_path):
         for file in files:
             remote_files.add(RemoteFile(path=root / file.name, stats=file.stats))
@@ -158,32 +156,31 @@
 
 
 @pytest.mark.parametrize("path_type", [str, Path])
-def test_downloader_file_filter_exclude_dir(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_file_filter_exclude_dir(
+    file_connection_with_path_and_files,
     path_type,
     tmp_path_factory,
     caplog,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
-        filters=[ExcludeDir(path_type(source_path / "exclude_dir"))],
+        filters=[ExcludeDir(path_type(remote_path / "raw/exclude_dir"))],
     )
 
     excluded = [
-        source_path / "exclude_dir/file_4.txt",
-        source_path / "exclude_dir/file_5.txt",
+        remote_path / "raw/exclude_dir/excluded1.txt",
+        remote_path / "raw/exclude_dir/nested/excluded2.txt",
     ]
 
     with caplog.at_level(logging.INFO):
         download_result = downloader.run()
 
         assert " filters = [" in caplog.text
-        assert f" ExcludeDir('{source_path}/exclude_dir')," in caplog.text
+        assert f" ExcludeDir('{remote_path}/raw/exclude_dir')," in caplog.text
         assert " ]" in caplog.text
 
     assert not download_result.failed
@@ -192,26 +189,27 @@ def test_downloader_file_filter_exclude_dir(
     assert download_result.successful
 
     assert sorted(download_result.successful) == sorted(
-        local_path / file.relative_to(source_path) for file in upload_test_files if file not in excluded
+        local_path / file.relative_to(remote_path) for file in uploaded_files if file not in excluded
     )
 
 
-def test_downloader_file_filter_glob(file_all_connections, source_path, upload_test_files, tmp_path_factory, caplog):
+def test_file_downloader_file_filter_glob(file_connection_with_path_and_files, tmp_path_factory, caplog):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         filters=[Glob("*.csv")],
     )
 
     excluded = [
-        source_path / "exclude_dir/file_4.txt",
-        source_path / "exclude_dir/file_5.txt",
-        source_path / "news_parse_zp/exclude_dir/file_1.txt",
-        source_path / "news_parse_zp/exclude_dir/file_2.txt",
-        source_path / "news_parse_zp/exclude_dir/file_3.txt",
+        remote_path / "raw/utf-8.txt",
+        remote_path / "raw/ascii.txt",
+        remote_path / "raw/exclude_dir/excluded1.txt",
+        remote_path / "raw/exclude_dir/nested/excluded2.txt",
+        remote_path / "raw/nested/exclude_dir/excluded3.txt",
     ]
 
     with caplog.at_level(logging.INFO):
@@ -226,58 +224,56 @@ def test_downloader_file_filter_glob(file_all_connections, source_path, upload_t
     assert download_result.successful
 
     assert sorted(download_result.successful) == sorted(
-        local_path / file.relative_to(source_path) for file in upload_test_files if file not in excluded
+        local_path / file.relative_to(remote_path) for file in uploaded_files if file not in excluded
     )
 
 
-def test_downloader_file_filter_is_ignored_by_user_input(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_file_filter_is_ignored_by_user_input(
+    file_connection_with_path_and_files,
     tmp_path_factory,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         filters=[Glob("*.csv")],
     )
 
-    download_result = downloader.run(upload_test_files)
+    download_result = downloader.run(uploaded_files)
 
     # filter is not being applied to explicit files list
     assert sorted(download_result.successful) == sorted(
-        local_path / file.relative_to(source_path) for file in upload_test_files
+        local_path / file.relative_to(remote_path) for file in uploaded_files
     )
 
 
 @pytest.mark.parametrize(
-    "source_path_value",
-    [None, lazy_fixture("source_path")],
-    ids=["Without source_path", "With source path"],
+    "pass_source_path",
+    [False, True],
+    ids=["Without source_path", "With source_path"],
 )
-def test_downloader_run_with_files_absolute(
-    file_all_connections,
-    source_path,
-    upload_test_files,
-    source_path_value,
+def test_file_downloader_run_with_files_absolute(
+    file_connection_with_path_and_files,
+    pass_source_path,
    tmp_path_factory,
    caplog,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path_value,
+        connection=file_connection,
+        source_path=remote_path if pass_source_path else None,
         local_path=local_path,
     )
 
     with caplog.at_level(logging.WARNING):
-        download_result = downloader.run(upload_test_files)
+        download_result = downloader.run(uploaded_files)
 
-    if source_path_value:
+    if pass_source_path:
         assert (
             "Passed both `source_path` and files list at the same time. Using explicit files list"
         ) in caplog.text
@@ -287,17 +283,17 @@ def test_downloader_run_with_files_absolute(
     assert not download_result.missing
     assert download_result.successful
 
-    if source_path_value:
-        local_files = [local_path / file.relative_to(source_path) for file in upload_test_files]
+    if pass_source_path:
+        local_files = [local_path / file.relative_to(remote_path) for file in uploaded_files]
     else:
         # no source path - do not preserve folder structure
-        local_files = [local_path / file.name for file in upload_test_files]
+        local_files = [local_path / file.name for file in uploaded_files]
 
     assert sorted(download_result.successful) == sorted(local_files)
 
-    for remote_file_path in upload_test_files:
-        if source_path_value:
-            local_file = local_path / remote_file_path.relative_to(source_path)
+    for remote_file_path in uploaded_files:
+        if pass_source_path:
+            local_file = local_path / remote_file_path.relative_to(remote_path)
         else:
             local_file = local_path / remote_file_path.name
@@ -305,28 +301,27 @@ def test_downloader_run_with_files_absolute(
         assert local_file.is_file()
         assert not local_file.is_dir()
 
-        remote_file = file_all_connections.resolve_file(remote_file_path)
+        remote_file = file_connection.resolve_file(remote_file_path)
 
         # file size is same as expected
-        assert local_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size
-        assert remote_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size
+        assert local_file.stat().st_size == file_connection.get_stat(remote_file).st_size
+        assert remote_file.stat().st_size == file_connection.get_stat(remote_file).st_size
 
         # file content is same as expected
-        assert local_file.read_bytes() == file_all_connections.read_bytes(remote_file)
+        assert local_file.read_bytes() == file_connection.read_bytes(remote_file)
 
 
-def test_downloader_run_with_files_relative_and_source_path(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_run_with_files_relative_and_source_path(
+    file_connection_with_path_and_files,
     tmp_path_factory,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
-    relative_files_path = [file.relative_to(source_path) for file in upload_test_files]
+    relative_files_path = [file.relative_to(remote_path) for file in uploaded_files]
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
     )
@@ -339,28 +334,28 @@ def test_downloader_run_with_files_relative_and_source_path(
     assert sorted(download_result.successful) == sorted(local_path / file for file in relative_files_path)
 
-    for remote_file_path in upload_test_files:
-        local_file = local_path / remote_file_path.relative_to(source_path)
+    for remote_file_path in uploaded_files:
+        local_file = local_path / remote_file_path.relative_to(remote_path)
 
         assert local_file.exists()
         assert local_file.is_file()
         assert not local_file.is_dir()
 
-        remote_file = file_all_connections.resolve_file(remote_file_path)
+        remote_file = file_connection.resolve_file(remote_file_path)
 
         # file size is same as expected
-        assert local_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size
-        assert remote_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size
+        assert local_file.stat().st_size == file_connection.get_stat(remote_file).st_size
+        assert remote_file.stat().st_size == file_connection.get_stat(remote_file).st_size
 
         # file content is same as expected
-        assert local_file.read_bytes() == file_all_connections.read_bytes(remote_file)
+        assert local_file.read_bytes() == file_connection.read_bytes(remote_file)
 
 
-def test_downloader_run_without_files_and_source_path(file_all_connections, tmp_path_factory):
+def test_file_downloader_run_without_files_and_source_path(file_connection, tmp_path_factory):
     local_path = tmp_path_factory.mktemp("local_path")
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
     )
 
     with pytest.raises(ValueError, match="Neither file list nor `source_path` are passed"):
@@ -372,19 +367,18 @@ def test_downloader_run_without_files_and_source_path(file_all_connections, tmp_
     [False, True],
     ids=["Without source_path", "With source_path"],
 )
-def test_downloader_run_with_empty_files_input(
-    file_all_connections,
+def test_file_downloader_run_with_empty_files_input(
+    file_connection_with_path_and_files,
     pass_source_path,
     tmp_path_factory,
-    upload_test_files,
-    source_path,
 ):
+    file_connection, remote_path, _ = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
-        source_path=source_path if pass_source_path else None,
+        source_path=remote_path if pass_source_path else None,
     )
 
     download_result = downloader.run([])  # this argument takes precedence
@@ -395,24 +389,25 @@
     assert not download_result.successful
 
 
-def test_downloader_run_with_empty_source_path(request, file_all_connections, tmp_path_factory):
-    source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
+def test_file_downloader_run_with_empty_source_path(request, file_connection_with_path, tmp_path_factory):
+    file_connection, remote_path = file_connection_with_path
+    remote_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
 
-    file_all_connections.create_dir(source_path)
-    if file_all_connections.path_exists(source_path):
+    file_connection.create_dir(remote_path)
+    if file_connection.path_exists(remote_path):
         # S3 does not support creating directories
 
         def finalizer():
-            file_all_connections.remove_dir(source_path, recursive=True)
+            file_connection.remove_dir(remote_path, recursive=True)
 
         request.addfinalizer(finalizer)
 
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
-        source_path=source_path,
+        source_path=remote_path,
     )
 
     download_result = downloader.run()
@@ -423,11 +418,11 @@ def finalizer():
     assert not download_result.successful
 
 
-def test_downloader_run_relative_path_without_source_path(file_all_connections, tmp_path_factory):
+def test_file_downloader_run_relative_path_without_source_path(file_connection, tmp_path_factory):
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
     )
@@ -435,21 +430,20 @@ def test_downloader_run_relative_path_without_source_path(file_all_connections,
         downloader.run(["some/relative/path/file.txt"])
 
 
-def test_downloader_run_absolute_path_not_match_source_path(
-    file_all_connections,
-    source_path,
+def test_file_downloader_run_absolute_path_not_match_source_path(
+    file_connection_with_path_and_files,
     tmp_path_factory,
-    upload_test_files,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
     )
 
-    error_message = f"File path '/some/relative/path/file.txt' does not match source_path '{source_path}'"
+    error_message = f"File path '/some/relative/path/file.txt' does not match source_path '{remote_path}'"
     with pytest.raises(ValueError, match=error_message):
         downloader.run(["/some/relative/path/file.txt"])
@@ -458,21 +452,22 @@
     "options",
     [{"mode": "error"}, FileDownloader.Options(mode="error"), FileDownloader.Options(mode=FileWriteMode.ERROR)],
 )
-def test_downloader_mode_error(file_all_connections, source_path, upload_test_files, options, tmp_path_factory):
+def test_file_downloader_mode_error(file_connection_with_path_and_files, options, tmp_path_factory):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     # make copy of files to download in the local_path
     local_files_stat = {}
-    for test_file in upload_test_files:
-        local_file = local_path / test_file.relative_to(source_path)
+    for test_file in uploaded_files:
+        local_file = local_path / test_file.relative_to(remote_path)
         local_file.parent.mkdir(parents=True, exist_ok=True)
         local_file.write_text("unchanged")
         local_files_stat[local_file] = local_file.stat()
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         options=options,
     )
@@ -484,7 +479,7 @@ def test_downloader_mode_error(file_all_connections, source_path, upload_test_fi
     assert not download_result.skipped
     assert download_result.failed
 
-    assert sorted(download_result.failed) == sorted(upload_test_files)
+    assert sorted(download_result.failed) == sorted(uploaded_files)
 
     for remote_file in download_result.failed:
         assert isinstance(remote_file, FailedRemoteFile)
@@ -495,7 +490,7 @@
         assert isinstance(remote_file.exception, FileExistsError)
 
-        local_file = local_path / remote_file.relative_to(source_path)
+        local_file = local_path / remote_file.relative_to(remote_path)
         assert re.search(rf"File '{local_file}' \(kind='file', .*\) already exists", str(remote_file.exception))
 
         # file size wasn't changed
@@ -506,14 +501,15 @@
         assert local_file.read_text() == "unchanged"
 
 
-def test_downloader_mode_ignore(file_all_connections, source_path, upload_test_files, tmp_path_factory, caplog):
+def test_file_downloader_mode_ignore(file_connection_with_path_and_files, tmp_path_factory, caplog):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     # make copy of files to download in the local_path
     local_files = []
     local_files_stat = {}
-    for test_file in upload_test_files:
-        local_file = local_path / test_file.relative_to(source_path)
+    for test_file in uploaded_files:
+        local_file = local_path / test_file.relative_to(remote_path)
         local_file.parent.mkdir(parents=True, exist_ok=True)
         local_file.write_text("unchanged")
@@ -521,8 +517,8 @@ def test_downloader_mode_ignore(file_all_connections, source_path, upload_test_f
         local_files_stat[local_file] = local_file.stat()
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         options=FileDownloader.Options(mode=FileWriteMode.IGNORE),
     )
@@ -538,7 +534,7 @@
     assert not download_result.missing
     assert download_result.skipped
 
-    assert sorted(download_result.skipped) == sorted(upload_test_files)
+    assert sorted(download_result.skipped) == sorted(uploaded_files)
 
     for remote_file in download_result.skipped:
         assert isinstance(remote_file, RemoteFile)
@@ -547,7 +543,7 @@
         assert remote_file.is_file()
         assert not remote_file.is_dir()
 
-        local_file = local_path / remote_file.relative_to(source_path)
+        local_file = local_path / remote_file.relative_to(remote_path)
 
         # file size wasn't changed
         assert local_file.stat().st_size != remote_file.stat().st_size
@@ -557,14 +553,15 @@
         assert local_file.read_text() == "unchanged"
 
 
-def test_downloader_mode_overwrite(file_all_connections, source_path, upload_test_files, tmp_path_factory, caplog):
+def test_file_downloader_mode_overwrite(file_connection_with_path_and_files, tmp_path_factory, caplog):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     # make copy of files to download in the local_path
     local_files = []
     local_files_stat = {}
-    for test_file in upload_test_files:
-        local_file = local_path / test_file.relative_to(source_path)
+    for test_file in uploaded_files:
+        local_file = local_path / test_file.relative_to(remote_path)
         local_file.parent.mkdir(parents=True, exist_ok=True)
         local_file.write_text("unchanged")
@@ -572,8 +569,8 @@ def test_downloader_mode_overwrite(file_all_connections, source_path, upload_tes
         local_files_stat[local_file] = local_file.stat()
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         options=FileDownloader.Options(mode=FileWriteMode.OVERWRITE),
     )
@@ -590,37 +587,36 @@
     assert download_result.successful
 
     assert sorted(download_result.successful) == sorted(
-        local_path / file.relative_to(source_path) for file in upload_test_files
+        local_path / file.relative_to(remote_path) for file in uploaded_files
     )
 
-    for remote_file_path in upload_test_files:
-        local_file = local_path / remote_file_path.relative_to(source_path)
+    for remote_file_path in uploaded_files:
+        local_file = local_path / remote_file_path.relative_to(remote_path)
 
         assert local_file.exists()
         assert local_file.is_file()
         assert not local_file.is_dir()
 
-        remote_file = file_all_connections.resolve_file(remote_file_path)
+        remote_file = file_connection.resolve_file(remote_file_path)
 
         # file size was changed
         assert local_file.stat().st_size != local_files_stat[local_file].st_size
-        assert local_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size
-        assert remote_file.stat().st_size == file_all_connections.get_stat(remote_file).st_size
+        assert local_file.stat().st_size == file_connection.get_stat(remote_file).st_size
+        assert remote_file.stat().st_size == file_connection.get_stat(remote_file).st_size
 
         # file content was changed
         assert local_file.read_text() != "unchanged"
-        assert local_file.read_bytes() == file_all_connections.read_bytes(remote_file)
+        assert local_file.read_bytes() == file_connection.read_bytes(remote_file)
 
 
 @pytest.mark.parametrize("local_dir_exist", [True, False])
-def test_downloader_mode_delete_all(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_mode_delete_all(
+    file_connection_with_path_and_files,
     tmp_path_factory,
     local_dir_exist,
     caplog,
 ):
+    file_connection, remote_path, _ = file_connection_with_path_and_files
     if local_dir_exist:
         local_path = tmp_path_factory.mktemp("local_path")
     else:
@@ -631,8 +627,8 @@ def test_downloader_mode_delete_all(
         temp_file.touch()
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         options=FileDownloader.Options(mode=FileWriteMode.DELETE_ALL),
     )
@@ -651,29 +647,30 @@
     assert not temp_file.exists()
 
 
-def test_downloader_run_missing_file(request, file_all_connections, upload_test_files, tmp_path_factory, caplog):
+def test_file_downloader_run_missing_file(request, file_connection_with_path_and_files, tmp_path_factory, caplog):
+    file_connection, _, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
 
-    file_all_connections.create_dir(target_path)
-    if file_all_connections.path_exists(target_path):
+    file_connection.create_dir(target_path)
+    if file_connection.path_exists(target_path):
         # S3 does not support creating directories
 
         def finalizer():
-            file_all_connections.remove_dir(target_path, recursive=True)
+            file_connection.remove_dir(target_path, recursive=True)
 
         request.addfinalizer(finalizer)
 
     # upload files
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
     )
 
     missing_file = target_path / "missing"
 
     with caplog.at_level(logging.WARNING):
-        download_result = downloader.run(upload_test_files + [missing_file])
+        download_result = downloader.run(uploaded_files + [missing_file])
 
     assert f"Missing file '{missing_file}', skipping" in caplog.text
@@ -682,61 +679,61 @@ def finalizer():
     assert download_result.missing
     assert download_result.successful
 
-    assert len(download_result.successful) == len(upload_test_files)
+    assert len(download_result.successful) == len(uploaded_files)
     assert len(download_result.missing) == 1
     assert download_result.missing == {missing_file}
     assert isinstance(download_result.missing[0], RemotePath)
 
 
-def test_downloader_source_path_does_not_exist(file_all_connections, tmp_path_factory):
+def test_file_downloader_source_path_does_not_exist(file_connection, tmp_path_factory):
     local_path = tmp_path_factory.mktemp("local_path")
-    source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
+    remote_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
     )
 
-    with pytest.raises(DirectoryNotFoundError, match=f"'{source_path}' does not exist"):
+    with pytest.raises(DirectoryNotFoundError, match=f"'{remote_path}' does not exist"):
         downloader.run()
 
 
-def test_downloader_source_path_not_a_directory(request, file_all_connections, tmp_path_factory):
+def test_file_downloader_source_path_not_a_directory(request, file_connection, tmp_path_factory):
     local_path = tmp_path_factory.mktemp("local_path")
-    source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.write_text(source_path, "abc")
+    remote_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
+    file_connection.write_text(remote_path, "abc")
 
     def finalizer():
-        file_all_connections.remove_file(source_path)
+        file_connection.remove_file(remote_path)
 
     request.addfinalizer(finalizer)
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
     )
 
-    with pytest.raises(NotADirectoryError, match=rf"'{source_path}' \(kind='file', .*\) is not a directory"):
+    with pytest.raises(NotADirectoryError, match=rf"'{remote_path}' \(kind='file', .*\) is not a directory"):
         downloader.run()
 
 
-def test_downloader_local_path_not_a_directory(request, file_all_connections):
-    source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.create_dir(source_path)
+def test_file_downloader_local_path_not_a_directory(request, file_connection):
+    remote_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
+    file_connection.create_dir(remote_path)
 
     def finalizer():
-        file_all_connections.remove_dir(source_path)
+        file_connection.remove_dir(remote_path)
 
     request.addfinalizer(finalizer)
 
     with tempfile.NamedTemporaryFile() as file:
         downloader = FileDownloader(
-            connection=file_all_connections,
-            source_path=source_path,
+            connection=file_connection,
+            source_path=remote_path,
             local_path=file.name,
         )
@@ -744,25 +741,25 @@ def finalizer():
             downloader.run()
 
 
-def test_downloader_run_input_is_not_file(request, file_all_connections, tmp_path_factory):
+def test_file_downloader_run_input_is_not_file(request, file_connection, tmp_path_factory):
     local_path = tmp_path_factory.mktemp("local_path")
-    source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    not_a_file = source_path / "not_a_file"
+    remote_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
+    not_a_file = remote_path / "not_a_file"
 
-    file_all_connections.create_dir(not_a_file)
+    file_connection.create_dir(not_a_file)
 
-    if not file_all_connections.path_exists(not_a_file):
+    if not file_connection.path_exists(not_a_file):
         # S3 does not support creating directories
         return
 
     def finalizer():
-        file_all_connections.remove_dir(source_path, recursive=True)
+        file_connection.remove_dir(remote_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
     )
@@ -770,13 +767,14 @@ def finalizer():
         downloader.run([not_a_file])
 
 
-def test_downloader_with_file_limit(file_all_connections, source_path, upload_test_files, tmp_path_factory, caplog):
+def test_file_downloader_with_file_limit(file_connection_with_path_and_files, tmp_path_factory, caplog):
+    file_connection, remote_path, _ = file_connection_with_path_and_files
     limit = 2
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         limits=[MaxFilesCount(2)],
     )
@@ -793,44 +791,44 @@ def test_downloader_with_file_limit(file_all_connections, source_path, upload_te
     assert len(download_result.successful) == limit
 
 
-def test_downloader_file_limit_is_ignored_by_user_input(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_file_limit_is_ignored_by_user_input(
+    file_connection_with_path_and_files,
    tmp_path_factory,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         limits=[MaxFilesCount(2)],
     )
 
-    download_result = downloader.run(upload_test_files)
+    download_result = downloader.run(uploaded_files)
 
     # limit is not being applied to explicit files list
-    assert len(download_result.successful) == len(upload_test_files)
+    assert len(download_result.successful) == len(uploaded_files)
 
 
-def test_downloader_limit_applied_after_filter(file_all_connections, source_path, upload_test_files, tmp_path_factory):
+def test_file_downloader_limit_applied_after_filter(file_connection_with_path_and_files, tmp_path_factory):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         filters=[Glob("*.csv")],
         limits=[MaxFilesCount(1)],
     )
 
     excluded = [
-        source_path / "exclude_dir/file_4.txt",
-        source_path / "exclude_dir/file_5.txt",
-        source_path / "news_parse_zp/exclude_dir/file_1.txt",
-        source_path / "news_parse_zp/exclude_dir/file_2.txt",
-        source_path / "news_parse_zp/exclude_dir/file_3.txt",
+        remote_path / "raw/utf-8.txt",
+        remote_path / "raw/ascii.txt",
+        remote_path / "raw/exclude_dir/excluded1.txt",
+        remote_path / "raw/exclude_dir/nested/excluded2.txt",
+        remote_path / "raw/nested/exclude_dir/excluded3.txt",
    ]
 
     download_result = downloader.run()
@@ -841,7 +839,7 @@ def test_downloader_limit_applied_after_filter(file_all_connections, source_path
     assert download_result.successful
 
     filtered = {
-        local_path / file.relative_to(source_path) for file in upload_test_files if os.fspath(file) not in excluded
+        local_path / file.relative_to(remote_path) for file in uploaded_files if os.fspath(file) not in excluded
     }
 
     # limit should be applied to files which satisfy the filter, not to all files in the source_path
@@ -849,18 +847,17 @@
     assert len(download_result.successful) == 1
 
 
-def test_downloader_detect_hwm_type_snap_batch_strategy(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_detect_hwm_type_snap_batch_strategy(
+    file_connection_with_path,
     tmp_path_factory,
 ):
+    file_connection, remote_path = file_connection_with_path
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
-        source_path=local_path,
+        source_path=remote_path,
         hwm_type="file_list",
     )
@@ -869,18 +866,17 @@ def test_downloader_detect_hwm_type_snap_batch_strategy(
         downloader.run()
 
 
-def test_downloader_detect_hwm_type_inc_batch_strategy(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_detect_hwm_type_inc_batch_strategy(
+    file_connection_with_path,
     tmp_path_factory,
 ):
+    file_connection, remote_path = file_connection_with_path
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
-        source_path=source_path,
+        source_path=remote_path,
         hwm_type="file_list",
     )
@@ -891,19 +887,18 @@ def test_downloader_detect_hwm_type_inc_batch_strategy(
         downloader.run()
 
 
-def test_downloader_detect_hwm_type_snapshot_strategy(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_detect_hwm_type_snapshot_strategy(
+    file_connection_with_path,
     tmp_path_factory,
     caplog,
 ):
+    file_connection, remote_path = file_connection_with_path
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
-        source_path=source_path,
+        source_path=remote_path,
         hwm_type="file_list",
     )
@@ -911,19 +906,18 @@
         downloader.run()
 
 
-def test_downloader_file_hwm_strategy_with_wrong_parameters(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_file_hwm_strategy_with_wrong_parameters(
+    file_connection_with_path_and_files,
     tmp_path_factory,
     caplog,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
-        source_path=source_path,
+        source_path=remote_path,
         hwm_type="file_list",
     )
@@ -942,20 +936,19 @@ def test_downloader_file_hwm_strategy_with_wrong_parameters(
         FileListHWM,
     ],
 )
-def test_downloader_file_hwm_strategy(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_downloader_file_hwm_strategy(
+    file_connection_with_path_and_files,
     tmp_path_factory,
     hwm_type,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
+        connection=file_connection,
         local_path=local_path,
         hwm_type=hwm_type,
-        source_path=source_path,
+        source_path=remote_path,
     )
 
     with IncrementalStrategy():
@@ -971,12 +964,13 @@ def test_downloader_file_hwm_strategy(
     ],
     ids=["no temp", "temp_path str", "temp_path PurePosixPath"],
 )
-def test_downloader_with_temp_path(file_all_connections, source_path, upload_test_files, temp_path, tmp_path_factory):
+def test_file_downloader_with_temp_path(file_connection_with_path_and_files, temp_path, tmp_path_factory):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     local_path = tmp_path_factory.mktemp("local_path")
 
     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         temp_path=temp_path,
     )
@@ -989,7 +983,7 @@
     assert download_result.successful
 
     assert sorted(download_result.successful) == sorted(
-        local_path / file.relative_to(source_path) for file in upload_test_files
+        local_path / file.relative_to(remote_path) for file in uploaded_files
     )
 
     if temp_path:
diff --git a/tests/tests_integration/tests_core_integration/test_file_mover_integration.py b/tests/tests_integration/tests_core_integration/test_file_mover_integration.py
index b56bb215c..85e24dc93 100644
--- a/tests/tests_integration/tests_core_integration/test_file_mover_integration.py
+++ b/tests/tests_integration/tests_core_integration/test_file_mover_integration.py
@@ -5,7 +5,6 @@
 from pathlib import Path, PurePosixPath
 
 import pytest
-from pytest_lazyfixture import lazy_fixture
 
 from onetl.exception import DirectoryNotFoundError, NotAFileError
 from onetl.file import FileMover
@@ -14,11 +13,12 @@
 from onetl.impl import FailedRemoteFile, FileWriteMode, RemoteFile, RemotePath
 
 
-def test_mover_view_file(file_all_connections, source_path, upload_test_files):
+def test_file_mover_view_file(file_connection_with_path_and_files):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
     )
@@ -26,7 +26,7 @@ def test_mover_view_file(file_all_connections, source_path, upload_test_files):
     remote_files = mover.view_files()
     remote_files_list = []
 
-    for root, _dirs, files in file_all_connections.walk(source_path):
+    for root, _dirs, files in file_connection.walk(source_path):
         for file in files:
             remote_files_list.append(RemotePath(root) / file)
@@ -36,23 +36,22 @@ def test_mover_view_file(file_all_connections, source_path, upload_test_files):
 
 @pytest.mark.parametrize("path_type", [str, PurePosixPath], ids=["path_type str", "path_type PurePosixPath"])
 @pytest.mark.parametrize("workers", [1, 3])
-def test_mover_run(
+def test_file_mover_run(
     request,
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
     path_type,
     workers,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=path_type(source_path),
         target_path=path_type(target_path),
         options=FileMover.Options(
@@ -63,11 +62,11 @@ def finalizer():
     # record files content and size before move
     files_content = {}
     files_size = {}
-    for root, _dirs, files in file_all_connections.walk(source_path):
+    for root, _dirs, files in file_connection.walk(source_path):
         for file_name in files:
             file_path = root / file_name
-            files_content[file_path] = file_all_connections.read_bytes(file_path)
-            files_size[file_path] = file_all_connections.get_stat(file_path).st_size
+            files_content[file_path] = file_connection.read_bytes(file_path)
+            files_size[file_path] = file_connection.get_stat(file_path).st_size
 
     move_result = mover.run()
@@ -77,54 +76,53 @@ def finalizer():
     assert move_result.successful
 
     assert sorted(move_result.successful) == sorted(
-        target_path / file.relative_to(source_path) for file in upload_test_files
+        target_path / file.relative_to(source_path) for file in uploaded_files
     )
 
     for target_file in move_result.successful:
         assert isinstance(target_file, RemoteFile)
 
         old_path = source_path / target_file.relative_to(target_path)
-        assert file_all_connections.resolve_file(target_file)
+        assert file_connection.resolve_file(target_file)
 
         # file size is same as expected
-        assert file_all_connections.get_stat(target_file).st_size == files_size[old_path]
+        assert file_connection.get_stat(target_file).st_size == files_size[old_path]
 
         # file content is same as expected
-        assert file_all_connections.read_bytes(target_file) == files_content[old_path]
+        assert file_connection.read_bytes(target_file) == files_content[old_path]
 
 
 @pytest.mark.parametrize("path_type", [str, Path])
-def test_mover_file_filter_exclude_dir(
+def test_file_mover_file_filter_exclude_dir(
     request,
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
     path_type,
     caplog,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
-        filters=[ExcludeDir(path_type(source_path / "exclude_dir"))],
+        filters=[ExcludeDir(path_type(source_path / "raw/exclude_dir"))],
     )
 
     excluded = [
-        source_path / "exclude_dir/file_4.txt",
-        source_path / "exclude_dir/file_5.txt",
+        source_path / "raw/exclude_dir/excluded1.txt",
+        source_path / "raw/exclude_dir/nested/excluded2.txt",
     ]
 
     with caplog.at_level(logging.INFO):
         move_result = mover.run()
 
         assert " filters = [" in caplog.text
-        assert f" ExcludeDir('{source_path}/exclude_dir')," in caplog.text
+        assert f" ExcludeDir('{source_path}/raw/exclude_dir')," in caplog.text
         assert " ]" in caplog.text
 
     assert not move_result.failed
@@ -133,31 +131,32 @@ def finalizer():
     assert move_result.successful
 
     assert sorted(move_result.successful) == sorted(
-        target_path / file.relative_to(source_path) for file in upload_test_files if file not in excluded
+        target_path / file.relative_to(source_path) for file in uploaded_files if file not in excluded
     )
 
 
-def test_mover_file_filter_glob(request, file_all_connections, source_path, upload_test_files, caplog):
+def test_file_mover_file_filter_glob(request, file_connection_with_path_and_files, caplog):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         filters=[Glob("*.csv")],
     )
 
     excluded = [
-        source_path / "exclude_dir/file_4.txt",
-        source_path / "exclude_dir/file_5.txt",
-        source_path / "news_parse_zp/exclude_dir/file_1.txt",
-        source_path / "news_parse_zp/exclude_dir/file_2.txt",
-        source_path / "news_parse_zp/exclude_dir/file_3.txt",
+        source_path / "raw/utf-8.txt",
+        source_path / "raw/ascii.txt",
+        source_path / "raw/exclude_dir/excluded1.txt",
+        source_path / "raw/exclude_dir/nested/excluded2.txt",
+        source_path / "raw/nested/exclude_dir/excluded3.txt",
     ]
 
     with caplog.at_level(logging.INFO):
@@ -172,77 +171,75 @@ def finalizer():
     assert move_result.successful
 
     assert sorted(move_result.successful) == sorted(
-        target_path / file.relative_to(source_path) for file in upload_test_files if file not in excluded
+        target_path / file.relative_to(source_path) for file in uploaded_files if file not in excluded
     )
 
 
-def test_mover_file_filter_is_ignored_by_user_input(
+def test_file_mover_file_filter_is_ignored_by_user_input(
     request,
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         filters=[Glob("*.csv")],
     )
 
-    move_result = mover.run(upload_test_files)
+    move_result = mover.run(uploaded_files)
 
     # filter is not being applied to explicit files list
     assert sorted(move_result.successful) == sorted(
-        target_path / file.relative_to(source_path) for file in upload_test_files
+        target_path / file.relative_to(source_path) for file in uploaded_files
     )
 
 
 @pytest.mark.parametrize(
-    "source_path_value",
-    [None, lazy_fixture("source_path")],
-    ids=["Without source_path", "With source path"],
+    "pass_source_path",
+    [False, True],
+    ids=["Without source_path", "With source_path"],
 )
-def test_mover_run_with_files_absolute(
+def test_file_mover_run_with_files_absolute(
     request,
-    file_all_connections,
-    source_path,
-    upload_test_files,
-    source_path_value,
+    file_connection_with_path_and_files,
+    pass_source_path,
     caplog,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = RemotePath(f"/tmp/test_upload_{secrets.token_hex(5)}")
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
-        source_path=source_path_value,
+        connection=file_connection,
+        source_path=source_path if pass_source_path else None,
         target_path=target_path,
     )
 
     # record files content and size before move
     files_content = {}
     files_size = {}
-    for root, _dirs, files in file_all_connections.walk(source_path):
+    for root, _dirs, files in file_connection.walk(source_path):
         for file_name in files:
             file_path = root / file_name
-            files_content[file_path] = file_all_connections.read_bytes(file_path)
-            files_size[file_path] = file_all_connections.get_stat(file_path).st_size
+            files_content[file_path] = file_connection.read_bytes(file_path)
+            files_size[file_path] = file_connection.get_stat(file_path).st_size
 
     with caplog.at_level(logging.WARNING):
-        move_result = mover.run(upload_test_files)
+        move_result = mover.run(uploaded_files)
 
-    if source_path_value:
+    if pass_source_path:
         assert (
             "Passed both `source_path` and files list at the same time. Using explicit files list"
         ) in caplog.text
@@ -252,55 +249,54 @@ def finalizer():
     assert not move_result.missing
     assert move_result.successful
 
-    if source_path_value:
-        target_files = [target_path / file.relative_to(source_path) for file in upload_test_files]
+    if pass_source_path:
+        target_files = [target_path / file.relative_to(source_path) for file in uploaded_files]
     else:
         # no source path - do not preserve folder structure
-        target_files = [target_path / file.name for file in upload_test_files]
+        target_files = [target_path / file.name for file in uploaded_files]
 
     assert sorted(move_result.successful) == sorted(target_files)
 
-    for old_file in upload_test_files:
-        if source_path_value:
+    for old_file in uploaded_files:
+        if pass_source_path:
             target_file = target_path / old_file.relative_to(source_path)
         else:
             target_file = target_path / old_file.name
 
-        assert file_all_connections.resolve_file(target_file)
+        assert file_connection.resolve_file(target_file)
 
         # file size is same as expected
-        assert file_all_connections.get_stat(target_file).st_size == files_size[old_file]
+        assert file_connection.get_stat(target_file).st_size == files_size[old_file]
 
         # file content is same as expected
-        assert file_all_connections.read_bytes(target_file) == files_content[old_file]
+        assert file_connection.read_bytes(target_file) == files_content[old_file]
 
 
-def test_mover_run_with_files_relative_and_source_path(
+def test_file_mover_run_with_files_relative_and_source_path(
     request,
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
-    relative_files_path = [file.relative_to(source_path) for file in upload_test_files]
+    relative_files_path = [file.relative_to(source_path) for file in uploaded_files]
 
     # record files content and size before move
     files_content = {}
     files_size = {}
-    for root, _dirs, files in file_all_connections.walk(source_path):
+    for root, _dirs, files in file_connection.walk(source_path):
         for file_name in files:
             file_path = root / file_name
-            files_content[file_path] = file_all_connections.read_bytes(file_path)
-            files_size[file_path] = file_all_connections.get_stat(file_path).st_size
+            files_content[file_path] = file_connection.read_bytes(file_path)
+            files_size[file_path] = file_connection.get_stat(file_path).st_size
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
     )
@@ -314,23 +310,23 @@ def finalizer():
     assert sorted(move_result.successful) == sorted(target_path / file for file in relative_files_path)
 
-    for old_file in upload_test_files:
+    for old_file in uploaded_files:
         target_file = target_path / old_file.relative_to(source_path)
 
-        assert file_all_connections.resolve_file(target_file)
+        assert file_connection.resolve_file(target_file)
 
         # file size is same as expected
-        assert file_all_connections.get_stat(target_file).st_size == files_size[old_file]
+        assert file_connection.get_stat(target_file).st_size == files_size[old_file]
 
         # file content is same as expected
-        assert file_all_connections.read_bytes(target_file) == files_content[old_file]
+        assert file_connection.read_bytes(target_file) == files_content[old_file]
 
 
-def test_mover_run_without_files_and_source_path(file_all_connections):
+def test_file_mover_run_without_files_and_source_path(file_connection):
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
     )
 
     with pytest.raises(ValueError, match="Neither file list nor `source_path` are passed"):
@@ -342,22 +338,21 @@ def test_mover_run_without_files_and_source_path(file_all_connections):
     [False, True],
     ids=["Without source_path", "With source_path"],
 )
-def test_mover_run_with_empty_files_input(
+def test_file_mover_run_with_empty_files_input(
     request,
-    file_all_connections,
+    file_connection_with_path_and_files,
     pass_source_path,
-    upload_test_files,
-    source_path,
 ):
+    file_connection, source_path, _ = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         source_path=source_path if pass_source_path else None,
     )
@@ -370,28 +365,28 @@ def finalizer():
     assert not move_result.successful
 
 
-def test_mover_run_with_empty_source_path(request, file_all_connections):
+def test_file_mover_run_with_empty_source_path(request, file_connection):
     source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.create_dir(source_path)
-    if not file_all_connections.path_exists(source_path):
+    file_connection.create_dir(source_path)
+    if not file_connection.path_exists(source_path):
         # S3 does not support creating directories
         return
 
     def finalizer1():
-        file_all_connections.remove_dir(source_path, recursive=True)
+        file_connection.remove_dir(source_path, recursive=True)
 
     request.addfinalizer(finalizer1)
 
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer2():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer2)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         source_path=source_path,
     )
@@ -404,11 +399,11 @@ def finalizer2():
     assert not move_result.successful
 
 
-def test_mover_run_relative_path_without_source_path(file_all_connections):
+def test_file_mover_run_relative_path_without_source_path(file_connection):
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
     )
@@ -416,15 +411,15 @@ def test_mover_run_relative_path_without_source_path(file_all_connections):
         mover.run(["some/relative/path/file.txt"])
 
 
-def test_mover_run_absolute_path_not_match_source_path(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_mover_run_absolute_path_not_match_source_path(
+    file_connection_with_path_and_files,
 ):
+    # uploading files only because S3 does not support empty directories
+    file_connection, source_path, _ = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
     )
@@ -438,24 +433,25 @@ def test_mover_run_absolute_path_not_match_source_path(
     "options",
     [{"mode": "error"}, FileMover.Options(mode="error"), FileMover.Options(mode=FileWriteMode.ERROR)],
 )
-def test_mover_mode_error(request, file_all_connections, source_path, upload_test_files, options):
+def test_file_mover_mode_error(request, file_connection_with_path_and_files, options):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     # create target files before move
     target_files_size = {}
-    for test_file in upload_test_files:
+    for test_file in uploaded_files:
         target_file = target_path / test_file.relative_to(source_path)
-        file_all_connections.write_text(target_file, "unchanged")
-        target_files_size[target_file] = file_all_connections.get_stat(target_file).st_size
+        file_connection.write_text(target_file, "unchanged")
+        target_files_size[target_file] = file_connection.get_stat(target_file).st_size
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         options=options,
@@ -468,44 +464,45 @@ def finalizer():
     assert not move_result.skipped
     assert move_result.failed
 
-    assert sorted(move_result.failed) == sorted(upload_test_files)
+    assert sorted(move_result.failed) == sorted(uploaded_files)
 
     for source_file in move_result.failed:
         assert isinstance(source_file, FailedRemoteFile)
-        assert file_all_connections.resolve_file(source_file)
+        assert file_connection.resolve_file(source_file)
         assert isinstance(source_file.exception, FileExistsError)
 
         target_file = target_path / source_file.relative_to(source_path)
         assert re.search(rf"File '{target_file}' \(kind='file', .*\) already exists", str(source_file.exception))
 
         # file size wasn't changed
-        assert file_all_connections.get_stat(target_file).st_size != source_file.stat().st_size
-        assert file_all_connections.get_stat(target_file).st_size == target_files_size[target_file]
+        assert file_connection.get_stat(target_file).st_size != source_file.stat().st_size
+        assert file_connection.get_stat(target_file).st_size == target_files_size[target_file]
 
         # file content wasn't changed
-        assert file_all_connections.read_text(target_file) == "unchanged"
+        assert file_connection.read_text(target_file) == "unchanged"
 
 
-def test_mover_mode_ignore(request, file_all_connections, source_path, upload_test_files, caplog):
+def test_file_mover_mode_ignore(request, file_connection_with_path_and_files, caplog):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     # create target files before move
     target_files = []
     target_files_size = {}
-    for test_file in upload_test_files:
+    for test_file in uploaded_files:
         target_file = target_path / test_file.relative_to(source_path)
-        file_all_connections.write_text(target_file, "unchanged")
+        file_connection.write_text(target_file, "unchanged")
         target_files.append(target_file)
-        target_files_size[target_file] = file_all_connections.get_stat(target_file).st_size
+        target_files_size[target_file] = file_connection.get_stat(target_file).st_size
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         options=FileMover.Options(mode=FileWriteMode.IGNORE),
@@ -522,27 +519,28 @@ def finalizer():
     assert not move_result.missing
     assert move_result.skipped
 
-    assert sorted(move_result.skipped) == sorted(upload_test_files)
+    assert sorted(move_result.skipped) == sorted(uploaded_files)
 
     for source_file in move_result.skipped:
         assert isinstance(source_file, RemoteFile)
-        assert file_all_connections.resolve_file(source_file)
+        assert file_connection.resolve_file(source_file)
 
         target_file = target_path / source_file.relative_to(source_path)
 
         # file size wasn't changed
-        assert file_all_connections.get_stat(target_file).st_size != source_file.stat().st_size
-        assert file_all_connections.get_stat(target_file).st_size == target_files_size[target_file]
+        assert file_connection.get_stat(target_file).st_size != source_file.stat().st_size
+        assert file_connection.get_stat(target_file).st_size == target_files_size[target_file]
 
         # file content wasn't changed
-        assert file_all_connections.read_text(target_file) == "unchanged"
+        assert file_connection.read_text(target_file) == "unchanged"
 
 
-def test_mover_mode_overwrite(request, file_all_connections, source_path, upload_test_files, caplog):
+def test_file_mover_mode_overwrite(request, file_connection_with_path_and_files, caplog):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
@@ -551,17 +549,17 @@ def finalizer():
     target_files_size = {}
     source_files_size = {}
     source_files_content = {}
-    for test_file in upload_test_files:
-        source_files_size[test_file] = file_all_connections.get_stat(test_file).st_size
-        source_files_content[test_file] = file_all_connections.read_text(test_file)
+    for test_file in uploaded_files:
+        source_files_size[test_file] = file_connection.get_stat(test_file).st_size
+        source_files_content[test_file] = file_connection.read_text(test_file)
 
         target_file = target_path / test_file.relative_to(source_path)
-        file_all_connections.write_text(target_file, "unchanged")
+        file_connection.write_text(target_file, "unchanged")
         target_files.append(target_file)
-        target_files_size[target_file] = file_all_connections.get_stat(target_file).st_size
+        target_files_size[target_file] = file_connection.get_stat(target_file).st_size
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         options=FileMover.Options(mode=FileWriteMode.OVERWRITE),
@@ -579,46 +577,45 @@ def finalizer():
     assert move_result.successful
 
     assert sorted(move_result.successful) == sorted(
-        target_path / file.relative_to(source_path) for file in upload_test_files
+        target_path / file.relative_to(source_path) for file in uploaded_files
     )
 
-    for source_file in upload_test_files:
+    for source_file in uploaded_files:
         target_file = target_path / source_file.relative_to(source_path)
 
-        assert file_all_connections.resolve_file(target_file)
-        assert not file_all_connections.path_exists(source_file)
+        assert file_connection.resolve_file(target_file)
+        assert not file_connection.path_exists(source_file)
 
         # file size was changed
-        assert file_all_connections.get_stat(target_file).st_size != target_files_size[target_file]
-        assert file_all_connections.get_stat(target_file).st_size == source_files_size[source_file]
+        assert file_connection.get_stat(target_file).st_size != target_files_size[target_file]
+        assert file_connection.get_stat(target_file).st_size == source_files_size[source_file]
 
         # file content was changed
-        assert file_all_connections.read_text(target_file) != "unchanged"
-        assert file_all_connections.read_text(target_file) == source_files_content[source_file]
+        assert file_connection.read_text(target_file) != "unchanged"
+        assert file_connection.read_text(target_file) == source_files_content[source_file]
 
 
 @pytest.mark.parametrize("remote_dir_exist", [True, False])
-def test_mover_mode_delete_all(
+def test_file_mover_mode_delete_all(
     request,
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
     remote_dir_exist,
     caplog,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = RemotePath(f"/tmp/test_upload_{secrets.token_hex(5)}")
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     temp_file = target_path / secrets.token_hex(5)
     if remote_dir_exist:
-        file_all_connections.write_text(temp_file, "unchanged")
+        file_connection.write_text(temp_file, "unchanged")
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         options=FileMover.Options(mode=FileWriteMode.DELETE_ALL),
@@ -634,31 +631,32 @@ def finalizer():
     assert move_result.successful
 
     # folder contains only moved files
-    content = (root / file.name for root, _dirs, files in file_all_connections.walk(target_path) for file in files)
+    content = (root / file.name for root, _dirs, files in file_connection.walk(target_path) for file in files)
     assert sorted(content) == sorted(move_result.successful)
 
-    assert not file_all_connections.path_exists(temp_file)
+    assert not file_connection.path_exists(temp_file)
 
 
-def test_mover_run_missing_file(request, file_all_connections, upload_test_files, caplog):
+def test_file_mover_run_missing_file(request, file_connection_with_path_and_files, caplog):
+    file_connection, _, uploaded_files = file_connection_with_path_and_files
     target_path = RemotePath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.create_dir(target_path)
+    file_connection.create_dir(target_path)
 
     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     # upload files
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
     )
 
     missing_file = target_path / "missing"
 
     with caplog.at_level(logging.WARNING):
-        move_result = mover.run(upload_test_files + [missing_file])
+        move_result = mover.run(uploaded_files + [missing_file])
 
     assert f"Missing file '{missing_file}', skipping" in caplog.text
@@ -667,19 +665,19 @@ def finalizer():
     assert move_result.missing
     assert move_result.successful
 
-    assert len(move_result.successful) == len(upload_test_files)
+    assert len(move_result.successful) == len(uploaded_files)
     assert len(move_result.missing) == 1
     assert move_result.missing == {missing_file}
     assert isinstance(move_result.missing[0], RemotePath)
 
 
-def test_mover_source_path_does_not_exist(file_all_connections):
+def test_file_mover_source_path_does_not_exist(file_connection):
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
     source_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
     )
@@ -688,24 +686,24 @@ def test_mover_source_path_does_not_exist(file_all_connections):
         mover.run()
 
 
-def test_mover_source_path_not_a_directory(request, file_all_connections):
+def test_file_mover_source_path_not_a_directory(request, file_connection):
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     def finalizer1():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)
 
     request.addfinalizer(finalizer1)
 
     source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.write_text(source_path, "abc")
+    file_connection.write_text(source_path, "abc")
 
     def finalizer2():
-        file_all_connections.remove_file(source_path)
+        file_connection.remove_file(source_path)
 
     request.addfinalizer(finalizer2)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
     )
@@ -714,25 +712,25 @@ def finalizer2():
         mover.run()
 
 
-def test_mover_target_path_not_a_directory(request, file_all_connections):
+def test_file_mover_target_path_not_a_directory(request, file_connection):
     source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.create_dir(source_path)
+    file_connection.create_dir(source_path)
 
     def finalizer1():
-        file_all_connections.remove_dir(source_path, recursive=True)
+        file_connection.remove_dir(source_path, recursive=True)
 
     request.addfinalizer(finalizer1)
 
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
-    file_all_connections.write_text(target_path, "")
+    file_connection.write_text(target_path, "")
 
     def finalizer2():
-        file_all_connections.remove_file(target_path)
+        file_connection.remove_file(target_path)
 
     request.addfinalizer(finalizer2)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
     )
@@ -741,25 +739,25 @@ def finalizer2():
         mover.run()
 
 
-def test_mover_run_input_is_not_file(request, file_all_connections):
+def test_file_mover_run_input_is_not_file(request, file_connection):
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
     source_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
     not_a_file = source_path / "not_a_file"
 
-    file_all_connections.create_dir(not_a_file)
+    file_connection.create_dir(not_a_file)
 
-    if not file_all_connections.path_exists(not_a_file):
+    if not file_connection.path_exists(not_a_file):
         # S3 does not support creating directories
         return
 
     def finalizer():
-        file_all_connections.remove_dir(source_path, recursive=True)
+        file_connection.remove_dir(source_path, recursive=True)
 
     request.addfinalizer(finalizer)
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
     )
@@ -767,12 +765,13 @@ def finalizer():
         mover.run([not_a_file])
 
 
-def test_mover_file_limit_custom(file_all_connections, source_path, upload_test_files, caplog):
+def test_file_mover_file_limit_custom(file_connection_with_path_and_files, caplog):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     limit = 2
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         limits=[MaxFilesCount(2)],
@@ -790,31 +789,31 @@ def test_mover_file_limit_custom(file_all_connections, source_path, upload_test_
     assert len(move_result.successful) == limit
 
 
-def test_mover_file_limit_is_ignored_by_user_input(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+def test_file_mover_file_limit_is_ignored_by_user_input(
+    file_connection_with_path_and_files,
 ):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         limits=[MaxFilesCount(2)],
     )
 
-    move_result = mover.run(upload_test_files)
+    move_result = mover.run(uploaded_files)
 
     # limit is not being applied to explicit files list
-    assert len(move_result.successful) == len(upload_test_files)
+    assert len(move_result.successful) == len(uploaded_files)
 
 
-def test_mover_limit_applied_after_filter(file_all_connections, source_path, upload_test_files):
+def test_file_mover_limit_applied_after_filter(file_connection_with_path_and_files):
+    file_connection, source_path, uploaded_files = file_connection_with_path_and_files
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     mover = FileMover(
-        connection=file_all_connections,
+        connection=file_connection,
         source_path=source_path,
         target_path=target_path,
         filters=[Glob("*.csv")],
@@ -822,11 +821,11 @@ def test_mover_limit_applied_after_filter(file_all_connections, source_path, upl
     )
 
     excluded = [
-        source_path / "exclude_dir/file_4.txt",
-        source_path / "exclude_dir/file_5.txt",
-        source_path / "news_parse_zp/exclude_dir/file_1.txt",
-        source_path / "news_parse_zp/exclude_dir/file_2.txt",
-        source_path / "news_parse_zp/exclude_dir/file_3.txt",
+        source_path / "raw/utf-8.txt",
+        source_path / "raw/ascii.txt",
+        source_path / "raw/exclude_dir/excluded1.txt",
+        source_path / "raw/exclude_dir/nested/excluded2.txt",
+        source_path / "raw/nested/exclude_dir/excluded3.txt",
     ]
 
     move_result = mover.run()
@@ -837,7 +836,7 @@ def test_mover_limit_applied_after_filter(file_all_connections, source_path, upl
     assert move_result.successful
 
     filtered = {
-        target_path / file.relative_to(source_path) for file in upload_test_files if os.fspath(file) not in excluded
+        target_path / file.relative_to(source_path) for file in uploaded_files if os.fspath(file) not in excluded
    }
 
     # limit should be applied to files which satisfy the filter, not to all files in the source_path
diff --git a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py
index 7597adc9d..6be9f84f7 100644
--- a/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py
+++ b/tests/tests_integration/tests_core_integration/test_file_uploader_integration.py
@@ -12,12 +12,12 @@
 from onetl.impl import FailedLocalFile, FileWriteMode, LocalPath, RemoteFile
 
 
-def test_uploader_view_files(file_all_connections, resource_path):
+def test_file_uploader_view_files(file_connection, resource_path):
     target_path = f"/tmp/test_upload_{secrets.token_hex(5)}"
 
     # upload files
     uploader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         local_path=resource_path,
     )
@@ -41,17 +41,17 @@ def test_uploader_view_files(file_all_connections, resource_path):
     ids=["run_path_type str", "run_path_type Path"],
 )
 @pytest.mark.parametrize("workers", [1, 3])
-def test_uploader_run_with_files(request, file_all_connections,
test_files, run_path_type, path_type, workers): +def test_file_uploader_run_with_files(request, file_connection, test_files, run_path_type, path_type, workers): target_path = path_type(f"/tmp/test_upload_{secrets.token_hex(5)}") def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) # upload files uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, options=FileUploader.Options( workers=workers, @@ -78,25 +78,25 @@ def finalizer(): local_file = next(file for file in test_files if file.name == remote_file.name) # file size is same as expected - assert file_all_connections.get_stat(remote_file).st_size == local_file.stat().st_size - assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == local_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size # file content is same as expected - assert file_all_connections.read_bytes(remote_file) == local_file.read_bytes() + assert file_connection.read_bytes(remote_file) == local_file.read_bytes() @pytest.mark.parametrize("path_type", [str, PurePosixPath], ids=["path_type str", "path_type Path"]) -def test_uploader_run_with_local_path(request, file_all_connections, resource_path, path_type): +def test_file_uploader_run_with_local_path(request, file_connection, resource_path, path_type): target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}") def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) # upload files uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, local_path=path_type(resource_path), ) @@ -127,24 +127,24 @@ def finalizer(): local_file = next(file for file in local_files_list if file.name == remote_file.name) # file size is same as expected - assert file_all_connections.get_stat(remote_file).st_size == local_file.stat().st_size - assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == local_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size # file content is same as expected - assert file_all_connections.read_bytes(remote_file) == local_file.read_bytes() + assert file_connection.read_bytes(remote_file) == local_file.read_bytes() -def test_uploader_run_missing_file(request, file_all_connections, test_files, caplog): +def test_file_uploader_run_missing_file(request, file_connection, test_files, caplog): target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}") def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) # upload files uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, ) @@ -170,16 +170,16 @@ def finalizer(): assert not missing_file.exists() -def test_uploader_run_delete_local(request, resource_path, test_files, file_all_connections, caplog): +def test_file_uploader_run_delete_local(request, resource_path, test_files, file_connection, caplog): target_path = 
PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}") def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, options=FileUploader.Options(delete_local=True), ) @@ -223,11 +223,11 @@ def finalizer(): local_file = next(file for file in test_files if file.name == remote_file.name) # file size is same as expected - assert file_all_connections.get_stat(remote_file).st_size == local_files_stat[local_file].st_size - assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == local_files_stat[local_file].st_size + assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size # file content is same as expected - assert file_all_connections.read_bytes(remote_file) == local_files_bytes[local_file] + assert file_connection.read_bytes(remote_file) == local_files_bytes[local_file] # uploaded file is removed assert local_file not in existing_files @@ -241,23 +241,23 @@ def finalizer(): "options", [{"mode": "error"}, FileUploader.Options(mode="error"), FileUploader.Options(mode=FileWriteMode.ERROR)], ) -def test_uploader_run_mode_error(request, file_all_connections, test_files, options): +def test_file_uploader_run_mode_error(request, file_connection, test_files, options): target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}") # make copy of files to upload in the target_path remote_files = [] for test_file in test_files: remote_file = target_path / test_file.name - remote_files.append(file_all_connections.write_text(remote_file, "unchanged")) + remote_files.append(file_connection.write_text(remote_file, "unchanged")) def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) # upload changed files uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, options=options, ) @@ -284,30 +284,30 @@ def finalizer(): assert re.search(rf"File '{remote_file}' \(kind='file', .*\) already exists", str(local_file.exception)) # file size wasn't changed - assert file_all_connections.get_stat(remote_file).st_size != local_file.stat().st_size - assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size != local_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size # file content wasn't changed - assert file_all_connections.read_text(remote_file) == "unchanged" + assert file_connection.read_text(remote_file) == "unchanged" -def test_uploader_run_mode_ignore(request, file_all_connections, test_files, caplog): +def test_file_uploader_run_mode_ignore(request, file_connection, test_files, caplog): target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}") # make copy of files to upload in the target_path remote_files = [] for test_file in test_files: remote_file = target_path / test_file.name - remote_files.append(file_all_connections.write_text(remote_file, "unchanged")) + remote_files.append(file_connection.write_text(remote_file, "unchanged")) def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + 
file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) # upload changed files uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, options=FileUploader.Options(mode=FileWriteMode.IGNORE), ) @@ -335,30 +335,30 @@ def finalizer(): remote_file = remote_files[remote_files.index(target_path / local_file.name)] # file size wasn't changed - assert file_all_connections.get_stat(remote_file).st_size != local_file.stat().st_size - assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size != local_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size # file content wasn't changed - assert file_all_connections.read_text(remote_file) == "unchanged" + assert file_connection.read_text(remote_file) == "unchanged" -def test_uploader_run_mode_overwrite(request, file_all_connections, test_files, caplog): +def test_file_uploader_run_mode_overwrite(request, file_connection, test_files, caplog): target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}") # make copy of files to upload in the target_path remote_files = [] for test_file in test_files: remote_file = target_path / test_file.name - remote_files.append(file_all_connections.write_text(remote_file, "unchanged")) + remote_files.append(file_connection.write_text(remote_file, "unchanged")) def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) # upload changed files uploader = FileUploader( - connection=file_all_connections, + connection=file_connection, target_path=target_path, options=FileUploader.Options(mode=FileWriteMode.OVERWRITE), ) @@ -387,20 +387,20 @@ def finalizer(): local_file = next(file for file in test_files if file.name == remote_file.name) # file size was changed - assert file_all_connections.get_stat(remote_file).st_size != old_remote_file.stat().st_size - assert file_all_connections.get_stat(remote_file).st_size == local_file.stat().st_size - assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size != old_remote_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == local_file.stat().st_size + assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size # file content was changed - assert file_all_connections.read_text(remote_file) != "unchanged" - assert file_all_connections.read_bytes(remote_file) == local_file.read_bytes() + assert file_connection.read_text(remote_file) != "unchanged" + assert file_connection.read_bytes(remote_file) == local_file.read_bytes() @pytest.mark.parametrize("remote_dir_exist", [True, False]) -def test_uploader_run_mode_delete_all( +def test_file_uploader_run_mode_delete_all( request, resource_path, - file_all_connections, + file_connection, test_files, remote_dir_exist, caplog, @@ -411,15 +411,15 @@ def test_uploader_run_mode_delete_all( temp_file = target_path / secrets.token_hex(5) if remote_dir_exist: - file_all_connections.write_text(temp_file, "abc") + file_connection.write_text(temp_file, "abc") def finalizer(): - file_all_connections.remove_dir(target_path, recursive=True) + file_connection.remove_dir(target_path, recursive=True) request.addfinalizer(finalizer) uploader = FileUploader( - connection=file_all_connections, 
+        connection=file_connection,
         target_path=target_path,
         options=FileUploader.Options(mode=FileWriteMode.DELETE_ALL),
     )
@@ -434,57 +434,57 @@ def finalizer():
     assert upload_result.successful

     target_path_content = []
-    for root, _dirs, files in file_all_connections.walk(target_path):
+    for root, _dirs, files in file_connection.walk(target_path):
         target_path_content.extend(root / file for file in files)

     # target path contains only downloaded files
     assert sorted(target_path_content) == sorted(upload_result.successful)

-    assert not file_all_connections.path_exists(temp_file)
+    assert not file_connection.path_exists(temp_file)


-def test_uploader_run_local_path_does_not_exist(file_all_connections, tmp_path_factory):
+def test_file_uploader_run_local_path_does_not_exist(file_connection, tmp_path_factory):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

     local_path_parent = tmp_path_factory.mktemp("local_path")
     local_path = local_path_parent / "abc"

-    uploader = FileUploader(connection=file_all_connections, target_path=target_path, local_path=local_path)
+    uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=local_path)

     with pytest.raises(DirectoryNotFoundError, match=f"'{local_path}' does not exist"):
         uploader.run()


-def test_uploader_run_local_path_not_a_directory(file_all_connections):
+def test_file_uploader_run_local_path_not_a_directory(file_connection):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

     with tempfile.NamedTemporaryFile() as file:
-        uploader = FileUploader(connection=file_all_connections, target_path=target_path, local_path=file.name)
+        uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=file.name)

         with pytest.raises(NotADirectoryError, match=rf"'{file.name}' \(kind='file', .*\) is not a directory"):
             uploader.run()


-def test_uploader_run_target_path_not_a_directory(request, file_all_connections, resource_path):
+def test_file_uploader_run_target_path_not_a_directory(request, file_connection, resource_path):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

-    file_all_connections.write_text(target_path, "abc")
+    file_connection.write_text(target_path, "abc")

     def finalizer():
-        file_all_connections.remove_file(target_path)
+        file_connection.remove_file(target_path)

     request.addfinalizer(finalizer)

-    uploader = FileUploader(connection=file_all_connections, target_path=target_path, local_path=resource_path)
+    uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=resource_path)

     with pytest.raises(NotADirectoryError, match=rf"'{target_path}' \(kind='file', .*\) is not a directory"):
         uploader.run()


-def test_uploader_run_input_is_not_file(file_all_connections, test_files):
+def test_file_uploader_run_input_is_not_file(file_connection, test_files):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

     # upload files
     uploader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
     )

@@ -498,12 +498,12 @@ def test_uploader_run_input_is_not_file(file_all_connections, test_files):
     [False, True],
     ids=["Without local_path", "With local_path"],
 )
-def test_uploader_run_with_empty_files(file_all_connections, pass_local_path, tmp_path_factory):
+def test_file_uploader_run_with_empty_files(file_connection, pass_local_path, tmp_path_factory):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
     local_path = tmp_path_factory.mktemp("local_path")

     downloader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         local_path=local_path if pass_local_path else None,
     )
@@ -516,17 +516,17 @@ def test_uploader_run_with_empty_files(file_all_connections, pass_local_path, tm
     assert not download_result.successful


-def test_uploader_run_with_empty_local_path(request, file_all_connections, tmp_path_factory):
+def test_file_uploader_run_with_empty_local_path(request, file_connection, tmp_path_factory):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")
     local_path = tmp_path_factory.mktemp("local_path")

     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)

     request.addfinalizer(finalizer)

     downloader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         local_path=local_path,
     )
@@ -539,26 +539,26 @@ def finalizer():
     assert not download_result.successful


-def test_uploader_without_files_and_without_local_path(file_all_connections):
+def test_file_uploader_without_files_and_without_local_path(file_connection):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

-    uploader = FileUploader(connection=file_all_connections, target_path=target_path)
+    uploader = FileUploader(connection=file_connection, target_path=target_path)

     with pytest.raises(ValueError, match="Neither file list nor `local_path` are passed"):
         uploader.run()


-def test_uploader_run_with_relative_files_and_local_path(request, file_all_connections, resource_path, caplog):
+def test_file_uploader_run_with_relative_files_and_local_path(request, file_connection, resource_path, caplog):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)

     request.addfinalizer(finalizer)

     # upload files
     uploader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         local_path=resource_path,
     )
@@ -588,24 +588,24 @@ def finalizer():
         local_file = resource_path / remote_file.relative_to(target_path)

         # file size is same as expected
-        assert file_all_connections.get_stat(remote_file).st_size == local_file.stat().st_size
-        assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size
+        assert file_connection.get_stat(remote_file).st_size == local_file.stat().st_size
+        assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size

         # file content is same as expected
-        assert file_all_connections.read_bytes(remote_file) == local_file.read_bytes()
+        assert file_connection.read_bytes(remote_file) == local_file.read_bytes()


-def test_uploader_run_with_absolute_files_and_local_path(request, file_all_connections, resource_path, caplog):
+def test_file_uploader_run_with_absolute_files_and_local_path(request, file_connection, resource_path, caplog):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

     def finalizer():
-        file_all_connections.remove_dir(target_path, recursive=True)
+        file_connection.remove_dir(target_path, recursive=True)

     request.addfinalizer(finalizer)

     # upload files
     uploader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         local_path=resource_path,
     )
@@ -634,26 +634,26 @@ def finalizer():
         local_file = resource_path / remote_file.relative_to(target_path)

         # file size is same as expected
-        assert file_all_connections.get_stat(remote_file).st_size == local_file.stat().st_size
-        assert file_all_connections.get_stat(remote_file).st_size == remote_file.stat().st_size
+        assert file_connection.get_stat(remote_file).st_size == local_file.stat().st_size
+        assert file_connection.get_stat(remote_file).st_size == remote_file.stat().st_size

         # file content is same as expected
-        assert file_all_connections.read_bytes(remote_file) == local_file.read_bytes()
+        assert file_connection.read_bytes(remote_file) == local_file.read_bytes()


-def test_uploader_run_absolute_path_not_match_local_path(file_all_connections, resource_path):
+def test_file_uploader_run_absolute_path_not_match_local_path(file_connection, resource_path):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

-    uploader = FileUploader(connection=file_all_connections, target_path=target_path, local_path=resource_path)
+    uploader = FileUploader(connection=file_connection, target_path=target_path, local_path=resource_path)

     with pytest.raises(ValueError, match=f"File path '/some/path/1' does not match source_path '{resource_path}'"):
         uploader.run(["/some/path/1", "/some/path/2"])


-def test_uploader_run_relative_paths_without_local_path(file_all_connections):
+def test_file_uploader_run_relative_paths_without_local_path(file_connection):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

-    uploader = FileUploader(connection=file_all_connections, target_path=target_path)
+    uploader = FileUploader(connection=file_connection, target_path=target_path)

     with pytest.raises(ValueError, match="Cannot pass relative file path with empty `local_path`"):
         uploader.run(["some/path/1", "some/path/2"])
@@ -668,12 +668,12 @@ def test_uploader_run_relative_paths_without_local_path(file_all_connections):
     ],
     ids=["no temp", "temp_path str", "temp_path PurePosixPath"],
 )
-def test_uploader_run_with_temp_path(file_all_connections, test_files, temp_path):
+def test_file_uploader_run_with_temp_path(file_connection, test_files, temp_path):
     target_path = PurePosixPath(f"/tmp/test_upload_{secrets.token_hex(5)}")

     # upload files
     uploader = FileUploader(
-        connection=file_all_connections,
+        connection=file_connection,
         target_path=target_path,
         temp_path=temp_path,
     )
@@ -687,7 +687,7 @@ def test_uploader_run_with_temp_path(file_all_connections, test_files, temp_path
     assert sorted(upload_result.successful) == sorted(target_path / file.name for file in test_files)

-    if temp_path and file_all_connections.path_exists(temp_path):
+    if temp_path and file_connection.path_exists(temp_path):
         # temp_path is not removed after upload is finished,
         # because this may conflict with processes running in parallel
-        file_all_connections.is_dir(temp_path)
+        file_connection.is_dir(temp_path)
diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py
index 6ad03f30f..bbd4d1f00 100644
--- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py
+++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_mongodb_reader_integration.py
@@ -6,7 +6,7 @@
 pytestmark = pytest.mark.mongodb


-@pytest.fixture(scope="function")
+@pytest.fixture()
 def df_schema():
     from pyspark.sql.types import (
         DoubleType,
diff --git a/tests/tests_integration/tests_file_connection_integration/test_file_connection_common_integration.py b/tests/tests_integration/tests_file_connection_integration/test_file_connection_common_integration.py
index 1c1a59615..7fb0842e3 100644
--- a/tests/tests_integration/tests_file_connection_integration/test_file_connection_common_integration.py
+++ b/tests/tests_integration/tests_file_connection_integration/test_file_connection_common_integration.py
@@ -3,7 +3,6 @@
 from pathlib import PurePosixPath

 import pytest
-from pytest_lazyfixture import lazy_fixture

 from onetl.base import SupportsRenameDir
 from onetl.exception import DirectoryExistsError, DirectoryNotFoundError, NotAFileError
@@ -12,86 +11,90 @@

 @pytest.mark.parametrize("path_type", [str, PurePosixPath])
 def test_file_connection_rm_dir_recursive(
-    file_all_connections,
-    source_path,
+    file_connection_with_path_and_files,
     path_type,
 ):
-    file_all_connections.remove_dir(path_type(os.fspath(source_path)), recursive=True)
+    file_connection, remote_path, _ = file_connection_with_path_and_files
+    file_connection.remove_dir(path_type(os.fspath(remote_path)), recursive=True)

-    assert not file_all_connections.path_exists(source_path)
+    assert not file_connection.path_exists(remote_path)

-    if file_all_connections.path_exists(source_path.parent):
+    if file_connection.path_exists(remote_path.parent):
         # S3 does not support creating directories
-        parent_paths = [os.fspath(path) for path in file_all_connections.list_dir(source_path.parent)]
-        assert source_path.name not in parent_paths
+        parent_paths = [os.fspath(path) for path in file_connection.list_dir(remote_path.parent)]
+        assert remote_path.name not in parent_paths


 @pytest.mark.parametrize("path_type", [str, PurePosixPath])
-def test_file_connection_remove_dir_non_empty(file_all_connections, source_path, upload_test_files, path_type):
+def test_file_connection_remove_dir_non_empty(file_connection_with_path_and_files, path_type):
+    file_connection, remote_path, _ = file_connection_with_path_and_files
     with pytest.raises(Exception):
-        file_all_connections.remove_dir(path_type(os.fspath(source_path)))
+        file_connection.remove_dir(path_type(os.fspath(remote_path)))


 @pytest.mark.parametrize("path_type", [str, PurePosixPath])
-def test_file_connection_remove_dir_fake_dir(file_all_connections, upload_test_files, path_type):
+def test_file_connection_remove_dir_fake_dir(file_connection, path_type):
     # Does not raise Exception
-    file_all_connections.remove_dir(path_type("/some/fake/dir"))
+    file_connection.remove_dir(path_type("/some/fake/dir"))


 @pytest.mark.parametrize("path_type", [str, PurePosixPath])
-def test_file_connection_create_dir(file_all_connections, source_path, path_type):
-    path = source_path / "some_dir"
+def test_file_connection_create_dir(file_connection_with_path, path_type):
+    file_connection, remote_path = file_connection_with_path
+    path = remote_path / "some_dir"

-    file_all_connections.close()
-    file_all_connections.create_dir(path_type(os.fspath(path)))
-    file_all_connections.close()
+    file_connection.close()
+    file_connection.create_dir(path_type(os.fspath(path)))
+    file_connection.close()
     # `close` called twice is not an error
-    file_all_connections.close()
+    file_connection.close()

-    if file_all_connections.path_exists(path):
+    if file_connection.path_exists(path):
         # S3 does not support creating directories
-        assert RemotePath("some_dir") in file_all_connections.list_dir(path.parent)
+        assert RemotePath("some_dir") in file_connection.list_dir(path.parent)
@pytest.mark.parametrize("path_type", [str, PurePosixPath]) -def test_file_connection_rename_file(file_all_connections, source_path, upload_test_files, path_type): - with file_all_connections as connection: +def test_file_connection_rename_file(file_connection_with_path_and_files, path_type): + file_connection, remote_path, _ = file_connection_with_path_and_files + with file_connection as connection: connection.rename_file( - source_file_path=path_type(os.fspath(source_path / "exclude_dir/file_5.txt")), - target_file_path=path_type(os.fspath(source_path / "exclude_dir/file_55.txt")), + source_file_path=path_type(os.fspath(remote_path / "raw/ascii.txt")), + target_file_path=path_type(os.fspath(remote_path / "raw/new.txt")), ) - list_dir = file_all_connections.list_dir(source_path / "exclude_dir/") + list_dir = file_connection.list_dir(remote_path / "raw/") - assert RemotePath("file_55.txt") in list_dir - assert RemotePath("file_5.txt") not in list_dir + assert RemotePath("new.txt") in list_dir + assert RemotePath("ascii.txt") not in list_dir @pytest.mark.parametrize("path_type", [str, PurePosixPath]) -def test_file_connection_rename_dir(file_all_connections, source_path, upload_test_files, path_type): - if not isinstance(file_all_connections, SupportsRenameDir): +def test_file_connection_rename_dir(file_connection_with_path_and_files, path_type): + file_connection, remote_path, _ = file_connection_with_path_and_files + if not isinstance(file_connection, SupportsRenameDir): # S3 does not have directories return def stringify(items): return list(map(os.fspath, items)) - old_dir = source_path / "exclude_dir" - new_dir = source_path / "exclude_dir1" - files_before = list(file_all_connections.walk(old_dir)) + old_dir = remote_path / "exclude_dir" + new_dir = remote_path / "exclude_dir1" + files_before = list(file_connection.walk(old_dir)) - file_all_connections.rename_dir( + file_connection.rename_dir( source_dir_path=path_type(os.fspath(old_dir)), target_dir_path=path_type(os.fspath(new_dir)), ) - list_dir = file_all_connections.list_dir(source_path) + list_dir = file_connection.list_dir(remote_path) assert RemotePath("exclude_dir") not in list_dir assert RemotePath("exclude_dir1") in list_dir # root has different name, but all directories content is the same files_after = [ - (os.fspath(root), stringify(dirs), stringify(files)) for root, dirs, files in file_all_connections.walk(new_dir) + (os.fspath(root), stringify(dirs), stringify(files)) for root, dirs, files in file_connection.walk(new_dir) ] assert files_after == [ (os.fspath(new_dir / root.relative_to(old_dir)), stringify(dirs), stringify(files)) @@ -99,61 +102,63 @@ def stringify(items): ] -def test_file_connection_rename_dir_already_exists(request, file_all_connections, source_path, upload_test_files): - if not isinstance(file_all_connections, SupportsRenameDir): +def test_file_connection_rename_dir_already_exists(request, file_connection_with_path_and_files): + file_connection, remote_path, upload_files = file_connection_with_path_and_files + if not isinstance(file_connection, SupportsRenameDir): # S3 does not have directories return - old_dir = source_path / "exclude_dir" - new_dir = source_path / "exclude_dir1" + old_dir = remote_path / "exclude_dir" + new_dir = remote_path / "exclude_dir1" def finalizer(): - file_all_connections.remove_dir(new_dir) + file_connection.remove_dir(new_dir) request.addfinalizer(finalizer) - file_all_connections.create_dir(new_dir) + file_connection.create_dir(new_dir) with 
pytest.raises(DirectoryExistsError): - file_all_connections.rename_dir( + file_connection.rename_dir( source_dir_path=old_dir, target_dir_path=new_dir, ) -def test_file_connection_rename_dir_replace(request, file_all_connections, source_path, upload_test_files): - if not isinstance(file_all_connections, SupportsRenameDir): +def test_file_connection_rename_dir_replace(request, file_connection_with_path_and_files): + file_connection, remote_path, _ = file_connection_with_path_and_files + if not isinstance(file_connection, SupportsRenameDir): # S3 does not have directories return def stringify(items): return list(map(os.fspath, items)) - old_dir = source_path / "exclude_dir" - new_dir = source_path / "exclude_dir1" + old_dir = remote_path / "exclude_dir" + new_dir = remote_path / "exclude_dir1" def finalizer(): - file_all_connections.remove_dir(new_dir, recursive=True) + file_connection.remove_dir(new_dir, recursive=True) request.addfinalizer(finalizer) - file_all_connections.create_dir(new_dir) + file_connection.create_dir(new_dir) - files_before = list(file_all_connections.walk(old_dir)) + files_before = list(file_connection.walk(old_dir)) - file_all_connections.rename_dir( + file_connection.rename_dir( source_dir_path=old_dir, target_dir_path=new_dir, replace=True, ) - list_dir = file_all_connections.list_dir(source_path) + list_dir = file_connection.list_dir(remote_path) assert RemotePath("exclude_dir") not in list_dir assert RemotePath("exclude_dir1") in list_dir # root has different name, but all directories content is the same files_after = [ - (os.fspath(root), stringify(dirs), stringify(files)) for root, dirs, files in file_all_connections.walk(new_dir) + (os.fspath(root), stringify(dirs), stringify(files)) for root, dirs, files in file_connection.walk(new_dir) ] assert files_after == [ (os.fspath(new_dir / root.relative_to(old_dir)), stringify(dirs), stringify(files)) @@ -161,199 +166,205 @@ def finalizer(): ] -def test_file_connection_read_text(file_all_connections, upload_files_with_encoding): - read_text = file_all_connections.read_text(path=upload_files_with_encoding["utf"]) +def test_file_connection_read_text(file_connection_with_path_and_files): + file_connection, remote_path, _ = file_connection_with_path_and_files + content = file_connection.read_text(path=remote_path / "raw/utf-8.txt") - assert isinstance(read_text, str) - assert read_text == "тестовый текст в тестовом файле\n" + assert isinstance(content, str) + assert content == "тестовый текст в тестовом файле\n" -def test_file_connection_read_bytes(file_all_connections, upload_files_with_encoding): - read_bytes = file_all_connections.read_bytes(path=upload_files_with_encoding["ascii"]) +def test_file_connection_read_bytes(file_connection_with_path_and_files): + file_connection, remote_path, _ = file_connection_with_path_and_files + content = file_connection.read_bytes(path=remote_path / "raw/ascii.txt") - assert isinstance(read_bytes, bytes) - assert read_bytes == b"test text in test file\n" + assert isinstance(content, bytes) + assert content == b"test text in test file\n" @pytest.mark.parametrize( - "path,exception", - [(lazy_fixture("source_path"), NotAFileError), ("/no_such_file.txt", FileNotFoundError)], + "pass_real_path, exception", + [(True, NotAFileError), (False, FileNotFoundError)], ) def test_file_connection_read_text_negative( - file_all_connections, - source_path, - upload_files_with_encoding, - path, + file_connection_with_path_and_files, + pass_real_path, exception, ): + # uploading files only because S3 
does not support empty directories + file_connection, remote_path, _ = file_connection_with_path_and_files + fake_path = "/no_such_file.txt" with pytest.raises(exception): - file_all_connections.read_text(path=path) + file_connection.read_text(path=remote_path if pass_real_path else fake_path) @pytest.mark.parametrize( - "path,exception", - [(lazy_fixture("source_path"), NotAFileError), ("/no_such_file.txt", FileNotFoundError)], + "pass_real_path, exception", + [(True, NotAFileError), (False, FileNotFoundError)], ) def test_file_connection_read_bytes_negative( - file_all_connections, - source_path, - upload_files_with_encoding, - path, + file_connection_with_path_and_files, + pass_real_path, exception, ): + # uploading files only because S3 does not support empty directories + file_connection, remote_path, _ = file_connection_with_path_and_files + fake_path = "/no_such_file.txt" with pytest.raises(exception): - file_all_connections.read_bytes(path=path) + file_connection.read_bytes(path=remote_path if pass_real_path else fake_path) @pytest.mark.parametrize( "file_name", - ["file_connection_write_text.txt", "file_connection_utf.txt"], - ids=["new file", "file existed"], + ["new.txt", "utf-8.txt"], + ids=["new file", "existing file"], ) -def test_file_connection_write_text(file_all_connections, source_path, file_name, upload_files_with_encoding): - file_all_connections.write_text(path=source_path / file_name, content="тестовый текст в utf-8") - - assert file_all_connections.read_text(source_path / file_name) == "тестовый текст в utf-8" +def test_file_connection_write_text(file_connection_with_path_and_files, file_name): + file_connection, remote_path, _ = file_connection_with_path_and_files + file_connection.write_text(path=remote_path / file_name, content="тестовый текст в utf-8") + assert file_connection.read_text(remote_path / file_name) == "тестовый текст в utf-8" @pytest.mark.parametrize( "file_name", - ["file_connection_write_bytes.txt", "file_connection_utf.txt"], - ids=["new file", "file existed"], + ["new.txt", "utf-8.txt"], + ids=["new file", "existing file"], ) -def test_file_connection_write_bytes(file_all_connections, source_path, file_name, upload_files_with_encoding): - file_all_connections.write_bytes(path=source_path / file_name, content=b"ascii test text") - assert file_all_connections.read_bytes(source_path / file_name) == b"ascii test text" +def test_file_connection_write_bytes(file_connection_with_path_and_files, file_name): + file_connection, remote_path, _ = file_connection_with_path_and_files + file_connection.write_bytes(path=remote_path / file_name, content=b"ascii test text") + assert file_connection.read_bytes(remote_path / file_name) == b"ascii test text" -def test_file_connection_write_text_fail_on_bytes_input(file_all_connections, source_path): +def test_file_connection_write_text_fail_on_bytes_input(file_connection_with_path): + file_connection, remote_path = file_connection_with_path with pytest.raises(TypeError): - file_all_connections.write_text(path=source_path / "some_file_name.txt", content=b"bytes to text") + file_connection.write_text(path=remote_path / "some_file_name.txt", content=b"bytes to text") -def test_file_connection_write_bytes_fail_on_text_input(file_all_connections, source_path): +def test_file_connection_write_bytes_fail_on_text_input(file_connection_with_path): + file_connection, remote_path = file_connection_with_path with pytest.raises(TypeError): - file_all_connections.write_bytes(path=source_path / "some_file_name.txt", content="text 
to bytes") + file_connection.write_bytes(path=remote_path / "some_file_name.txt", content="text to bytes") -def test_file_connection_write_encoding(file_all_connections, source_path): - file_all_connections.write_text( - path=source_path / "cp_1251_file", +def test_file_connection_write_encoding(file_connection_with_path): + file_connection, remote_path = file_connection_with_path + file_connection.write_text( + path=remote_path / "cp_1251_file", content="тестовый текст в utf-8", encoding="cp1251", ) - assert file_all_connections.read_bytes(path=source_path / "cp_1251_file") == "тестовый текст в utf-8".encode( + assert file_connection.read_bytes(path=remote_path / "cp_1251_file") == "тестовый текст в utf-8".encode( "cp1251", ) -def test_file_connection_read_encoding(file_all_connections, source_path): - file_all_connections.write_bytes( - path=source_path / "cp_1251_file", +def test_file_connection_read_encoding(file_connection_with_path): + file_connection, remote_path = file_connection_with_path + file_connection.write_bytes( + path=remote_path / "cp_1251_file", content="тестовый текст в utf-8".encode("cp1251"), ) - assert ( - file_all_connections.read_text(path=source_path / "cp_1251_file", encoding="cp1251") - == "тестовый текст в utf-8" - ) + assert file_connection.read_text(path=remote_path / "cp_1251_file", encoding="cp1251") == "тестовый текст в utf-8" @pytest.mark.parametrize("path_type", [str, PurePosixPath]) -def test_file_connection_path_exists(file_all_connections, source_path, upload_test_files, path_type): - assert file_all_connections.path_exists(path_type(os.fspath(source_path / "exclude_dir/file_5.txt"))) - assert file_all_connections.path_exists(path_type(os.fspath(source_path / "exclude_dir"))) - assert not file_all_connections.path_exists(path_type(os.fspath(source_path / "path_not_exist"))) +def test_file_connection_path_exists(file_connection_with_path_and_files, path_type): + file_connection, remote_path, _ = file_connection_with_path_and_files + assert file_connection.path_exists(path_type(os.fspath(remote_path / "raw/ascii.txt"))) + assert file_connection.path_exists(path_type(os.fspath(remote_path / "raw"))) + assert not file_connection.path_exists(path_type(os.fspath(remote_path / "path_not_exist"))) @pytest.mark.parametrize("path_type", [str, PurePosixPath]) -def test_file_connection_is_dir(file_all_connections, source_path, upload_test_files, path_type): - assert file_all_connections.is_dir(path_type(os.fspath(source_path / "exclude_dir"))) - assert not file_all_connections.is_dir(path_type(os.fspath(source_path / "exclude_dir/file_5.txt"))) +def test_file_connection_is_dir(file_connection_with_path_and_files, path_type): + file_connection, remote_path, _ = file_connection_with_path_and_files + assert file_connection.is_dir(path_type(os.fspath(remote_path / "raw"))) + assert not file_connection.is_dir(path_type(os.fspath(remote_path / "raw/ascii.txt"))) with pytest.raises(DirectoryNotFoundError): - file_all_connections.is_dir(path_type(os.fspath(source_path / "path_not_exist"))) + file_connection.is_dir(path_type(os.fspath(remote_path / "path_not_exist"))) @pytest.mark.parametrize("path_type", [str, PurePosixPath]) -def test_file_connection_is_file(file_all_connections, source_path, upload_test_files, path_type): - assert file_all_connections.is_file(path_type(os.fspath(source_path / "exclude_dir/file_5.txt"))) - assert not file_all_connections.is_file(path_type(os.fspath(source_path / "exclude_dir"))) +def 
test_file_connection_is_file(file_connection_with_path_and_files, path_type): + file_connection, remote_path, _ = file_connection_with_path_and_files + assert file_connection.is_file(path_type(os.fspath(remote_path / "raw/ascii.txt"))) + assert not file_connection.is_file(path_type(os.fspath(remote_path / "raw"))) with pytest.raises(FileNotFoundError): - file_all_connections.is_file(path_type(os.fspath(source_path / "path_not_exist"))) + file_connection.is_file(path_type(os.fspath(remote_path / "path_not_exist"))) @pytest.mark.parametrize("path_type", [str, PurePosixPath]) def test_file_connection_download_file( - file_all_connections, - source_path, - upload_test_files, + file_connection_with_path_and_files, tmp_path_factory, path_type, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") - remote_file_path = source_path / "news_parse_zp/2018_03_05_10_00_00/newsage-zp-2018_03_05_10_00_00.csv" + remote_file_path = remote_path / "raw/some.csv" - download_result = file_all_connections.download_file( + download_result = file_connection.download_file( remote_file_path=path_type(os.fspath(remote_file_path)), local_file_path=path_type(local_path / "file.csv"), ) assert download_result.exists() - assert download_result.stat().st_size == file_all_connections.resolve_file(remote_file_path).stat().st_size - assert download_result.read_text() == file_all_connections.read_text(remote_file_path) + assert download_result.stat().st_size == file_connection.resolve_file(remote_file_path).stat().st_size + assert download_result.read_text() == file_connection.read_text(remote_file_path) @pytest.mark.parametrize("path_type", [str, PurePosixPath]) -def test_file_connection_upload_file(file_all_connections, test_files, path_type): - upload_result = file_all_connections.upload_file( +def test_file_connection_upload_file(file_connection, test_files, path_type): + upload_result = file_connection.upload_file( local_file_path=path_type(test_files[0]), remote_file_path=path_type(path_type(f"/tmp/test_upload_{secrets.token_hex(5)}")), ) assert upload_result.exists() assert upload_result.stat().st_size == test_files[0].stat().st_size - assert file_all_connections.read_text(upload_result) == test_files[0].read_text() + assert file_connection.read_text(upload_result) == test_files[0].read_text() @pytest.mark.parametrize( "path,exception", [ - ("exclude_dir/", NotAFileError), - ("exclude_dir/file_not_exists", FileNotFoundError), + ("raw/exclude_dir/", NotAFileError), + ("raw/exclude_dir/file_not_exists", FileNotFoundError), ], ids=["directory", "file"], ) def test_file_connection_download_file_wrong_source_type( - file_all_connections, - upload_test_files, + file_connection_with_path_and_files, tmp_path_factory, - source_path, path, exception, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") with pytest.raises(exception): - file_all_connections.download_file( - remote_file_path=source_path / path, - local_file_path=local_path / "file_5.txt", + file_connection.download_file( + remote_file_path=remote_path / path, + local_file_path=local_path / "fil.txt", ) @pytest.mark.parametrize("replace", [True, False]) def test_file_connection_download_file_wrong_target_type( - file_all_connections, - source_path, - upload_test_files, + file_connection_with_path_and_files, tmp_path_factory, replace, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = 
tmp_path_factory.mktemp("local_path") with pytest.raises(NotAFileError): - file_all_connections.download_file( - remote_file_path=source_path / "exclude_dir/file_5.txt", + file_connection.download_file( + remote_file_path=remote_path / "raw/ascii.txt", local_file_path=local_path, replace=replace, ) @@ -361,12 +372,12 @@ def test_file_connection_download_file_wrong_target_type( @pytest.mark.parametrize( "source,exception", - [("news_parse_zp", NotAFileError), ("file_not_exist", FileNotFoundError)], - ids=["directory", "file"], + [("raw", NotAFileError), ("does_not_exist", FileNotFoundError)], + ids=["directory", "missing"], ) -def test_file_connection_upload_file_wrong_source_type(file_all_connections, resource_path, source, exception): +def test_file_connection_upload_file_wrong_source(file_connection, resource_path, source, exception): with pytest.raises(exception): - file_all_connections.upload_file( + file_connection.upload_file( local_file_path=resource_path / source, remote_file_path=f"/tmp/test_upload_{secrets.token_hex(5)}", ) @@ -374,57 +385,54 @@ def test_file_connection_upload_file_wrong_source_type(file_all_connections, res @pytest.mark.parametrize("replace", [True, False]) def test_file_connection_upload_file_wrong_target_type( - file_all_connections, - source_path, - upload_test_files, + file_connection_with_path_and_files, test_files, replace, ): + file_connection, remote_path, _ = file_connection_with_path_and_files with pytest.raises(NotAFileError): - file_all_connections.upload_file( + file_connection.upload_file( local_file_path=test_files[0], - remote_file_path=source_path / "exclude_dir", + remote_file_path=remote_path / "raw/exclude_dir", replace=replace, ) @pytest.mark.parametrize("path_type", [str, PurePosixPath]) def test_file_connection_download_replace_target( - file_all_connections, - source_path, - upload_files_with_encoding, + file_connection_with_path_and_files, tmp_path_factory, path_type, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") file_path = local_path / "file.txt" file_path.write_text("text to replace") - remote_file_path = source_path / "file_connection_utf.txt" + remote_file_path = remote_path / "raw/utf-8.txt" - download_result = file_all_connections.download_file( + download_result = file_connection.download_file( remote_file_path=path_type(remote_file_path), local_file_path=path_type(file_path), replace=True, ) assert download_result.exists() - assert download_result.stat().st_size == file_all_connections.resolve_file(remote_file_path).stat().st_size + assert download_result.stat().st_size == file_connection.resolve_file(remote_file_path).stat().st_size assert download_result.read_text() == "тестовый текст в тестовом файле\n" def test_file_connection_download_replace_target_negative( - file_all_connections, - source_path, - upload_files_with_encoding, + file_connection_with_path_and_files, tmp_path_factory, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") file_path = local_path / "file.txt" file_path.write_text("test file") - remote_file_path = source_path / "file_connection_utf.txt" + remote_file_path = remote_path / "raw/utf-8.txt" with pytest.raises(FileExistsError): - file_all_connections.download_file( + file_connection.download_file( remote_file_path=remote_file_path, local_file_path=file_path, replace=False, @@ -434,45 +442,40 @@ def test_file_connection_download_replace_target_negative( 
@pytest.mark.parametrize("path_type", [str, PurePosixPath]) def test_file_connection_upload_replace_target( - file_all_connections, - source_path, - upload_test_files, + file_connection_with_path_and_files, tmp_path_factory, path_type, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") file_path = local_path / "file.txt" file_path.write_text("test local file") - upload_result = file_all_connections.upload_file( + upload_result = file_connection.upload_file( local_file_path=path_type(file_path), - remote_file_path=path_type(os.fspath(source_path / "exclude_dir/file_5.txt")), + remote_file_path=path_type(os.fspath(remote_path / "raw/new.txt")), replace=True, ) assert upload_result.exists() assert upload_result.stat().st_size == file_path.stat().st_size - assert file_all_connections.read_text(upload_result) == "test local file" + assert file_connection.read_text(upload_result) == "test local file" def test_file_connection_upload_replace_target_negative( - file_all_connections, - source_path, + file_connection_with_path_and_files, tmp_path_factory, - upload_files_with_encoding, - test_files, ): + file_connection, remote_path, _ = file_connection_with_path_and_files local_path = tmp_path_factory.mktemp("local_path") file_path = local_path / "file.txt" file_path.write_text("test local file") with pytest.raises(FileExistsError): - file_all_connections.upload_file( + file_connection.upload_file( local_file_path=file_path, - remote_file_path=source_path / "file_connection_utf.txt", + remote_file_path=remote_path / "raw/utf-8.txt", replace=False, ) - assert ( - file_all_connections.read_text(source_path / "file_connection_utf.txt") == "тестовый текст в тестовом файле\n" - ) + assert file_connection.read_text(remote_path / "raw/utf-8.txt") == "тестовый текст в тестовом файле\n" diff --git a/tests/tests_integration/tests_file_connection_integration/test_ftp_integration.py b/tests/tests_integration/tests_file_connection_integration/test_ftp_file_connection_integration.py similarity index 52% rename from tests/tests_integration/tests_file_connection_integration/test_ftp_integration.py rename to tests/tests_integration/tests_file_connection_integration/test_ftp_file_connection_integration.py index 44500faf6..df99d72d9 100644 --- a/tests/tests_integration/tests_file_connection_integration/test_ftp_integration.py +++ b/tests/tests_integration/tests_file_connection_integration/test_ftp_file_connection_integration.py @@ -5,38 +5,39 @@ pytestmark = [pytest.mark.ftp, pytest.mark.file_connection, pytest.mark.connection] -def test_ftp_check(ftp_connection, caplog): +def test_ftp_file_connection_check_success(ftp_file_connection, caplog): + ftp = ftp_file_connection with caplog.at_level(logging.INFO): - assert ftp_connection.check() == ftp_connection + assert ftp.check() == ftp assert "type = FTP" in caplog.text - assert f"host = '{ftp_connection.host}'" in caplog.text - assert f"port = {ftp_connection.port}" in caplog.text - assert f"user = '{ftp_connection.user}'" in caplog.text + assert f"host = '{ftp.host}'" in caplog.text + assert f"port = {ftp.port}" in caplog.text + assert f"user = '{ftp.user}'" in caplog.text assert "password = SecretStr('**********')" in caplog.text - assert ftp_connection.password.get_secret_value() not in caplog.text + assert ftp.password.get_secret_value() not in caplog.text assert "Connection is available" in caplog.text -def test_ftp_check_anonymous(ftp_server, caplog): +def 
test_ftp_file_connection_check_anonymous(ftp_server, caplog): from onetl.connection import FTP - anonymous_connection = FTP(host=ftp_server.host, port=ftp_server.port) + anonymous = FTP(host=ftp_server.host, port=ftp_server.port) with caplog.at_level(logging.INFO): - assert anonymous_connection.check() == anonymous_connection + assert anonymous.check() == anonymous assert "type = FTP" in caplog.text - assert f"host = '{anonymous_connection.host}'" in caplog.text - assert f"port = {anonymous_connection.port}" in caplog.text + assert f"host = '{anonymous.host}'" in caplog.text + assert f"port = {anonymous.port}" in caplog.text assert "user = " not in caplog.text assert "password = " not in caplog.text assert "Connection is available" in caplog.text -def test_ftp_wrong_source_check(ftp_server): +def test_ftp_file_connection_check_failed(ftp_server): from onetl.connection import FTP ftp = FTP(host=ftp_server.host, port=ftp_server.port, user="unknown", password="unknown") diff --git a/tests/tests_integration/tests_file_connection_integration/test_ftps_integration.py b/tests/tests_integration/tests_file_connection_integration/test_ftps_file_connection_integration.py similarity index 52% rename from tests/tests_integration/tests_file_connection_integration/test_ftps_integration.py rename to tests/tests_integration/tests_file_connection_integration/test_ftps_file_connection_integration.py index 9a5043ee9..f3756801e 100644 --- a/tests/tests_integration/tests_file_connection_integration/test_ftps_integration.py +++ b/tests/tests_integration/tests_file_connection_integration/test_ftps_file_connection_integration.py @@ -5,38 +5,39 @@ pytestmark = [pytest.mark.ftps, pytest.mark.file_connection, pytest.mark.connection] -def test_ftps_check(ftps_connection, caplog): +def test_ftps_file_connection_check_success(ftps_file_connection, caplog): + ftps = ftps_file_connection with caplog.at_level(logging.INFO): - assert ftps_connection.check() == ftps_connection + assert ftps.check() == ftps assert "type = FTPS" in caplog.text - assert f"host = '{ftps_connection.host}'" in caplog.text - assert f"port = {ftps_connection.port}" in caplog.text - assert f"user = '{ftps_connection.user}'" in caplog.text + assert f"host = '{ftps.host}'" in caplog.text + assert f"port = {ftps.port}" in caplog.text + assert f"user = '{ftps.user}'" in caplog.text assert "password = SecretStr('**********')" in caplog.text - assert ftps_connection.password.get_secret_value() not in caplog.text + assert ftps.password.get_secret_value() not in caplog.text assert "Connection is available" in caplog.text -def test_ftps_check_anonymous(ftps_server, caplog): +def test_ftps_file_connection_check_anonymous(ftps_server, caplog): from onetl.connection import FTPS - anonymous_connection = FTPS(host=ftps_server.host, port=ftps_server.port) + anonymous = FTPS(host=ftps_server.host, port=ftps_server.port) with caplog.at_level(logging.INFO): - assert anonymous_connection.check() == anonymous_connection + assert anonymous.check() == anonymous assert "type = FTP" in caplog.text - assert f"host = '{anonymous_connection.host}'" in caplog.text - assert f"port = {anonymous_connection.port}" in caplog.text + assert f"host = '{anonymous.host}'" in caplog.text + assert f"port = {anonymous.port}" in caplog.text assert "user = " not in caplog.text assert "password = " not in caplog.text assert "Connection is available" in caplog.text -def test_ftps_wrong_source_check(ftps_server): +def test_ftps_file_connection_check_failed(ftps_server): from onetl.connection 
diff --git a/tests/tests_integration/tests_file_connection_integration/test_hdfs_integration.py b/tests/tests_integration/tests_file_connection_integration/test_hdfs_file_connection_integration.py
similarity index 88%
rename from tests/tests_integration/tests_file_connection_integration/test_hdfs_integration.py
rename to tests/tests_integration/tests_file_connection_integration/test_hdfs_file_connection_integration.py
index 65a70c9ca..46ce07051 100644
--- a/tests/tests_integration/tests_file_connection_integration/test_hdfs_integration.py
+++ b/tests/tests_integration/tests_file_connection_integration/test_hdfs_file_connection_integration.py
@@ -12,13 +12,14 @@
 pytestmark = [pytest.mark.hdfs, pytest.mark.file_connection, pytest.mark.connection]


-def test_hdfs_check_anonymous(hdfs_connection, caplog):
+def test_hdfs_file_connection_check_anonymous(hdfs_file_connection, caplog):
+    hdfs = hdfs_file_connection
     with caplog.at_level(logging.INFO):
-        assert hdfs_connection.check() == hdfs_connection
+        assert hdfs.check() == hdfs

     assert "type = HDFS" in caplog.text
-    assert f"host = '{hdfs_connection.host}'" in caplog.text
-    assert f"port = {hdfs_connection.webhdfs_port}" in caplog.text
+    assert f"host = '{hdfs.host}'" in caplog.text
+    assert f"port = {hdfs.webhdfs_port}" in caplog.text
     assert "timeout = 10" in caplog.text
     assert "user = " not in caplog.text
     assert "keytab =" not in caplog.text
@@ -27,7 +28,7 @@ def test_hdfs_check_anonymous(hdfs_connection, caplog):
     assert "Connection is available" in caplog.text


-def test_hdfs_check_with_keytab(mocker, hdfs_server, caplog, request, tmp_path_factory):
+def test_hdfs_file_connection_check_with_keytab(mocker, hdfs_server, caplog, request, tmp_path_factory):
     from onetl.connection import HDFS
     from onetl.connection.file_connection import hdfs

@@ -60,7 +61,7 @@ def finalizer():
     assert "Connection is available" in caplog.text


-def test_hdfs_check_with_password(mocker, hdfs_server, caplog):
+def test_hdfs_file_connection_check_with_password(mocker, hdfs_server, caplog):
     from onetl.connection import HDFS
     from onetl.connection.file_connection import hdfs

@@ -83,14 +84,14 @@ def test_hdfs_check_with_password(mocker, hdfs_server, caplog):
     assert "Connection is available" in caplog.text


-def test_hdfs_wrong_source_check_error():
+def test_hdfs_file_connection_check_failed():
     from onetl.connection import HDFS

     with pytest.raises(RuntimeError, match="Connection is unavailable"):
         HDFS(host="hive1", port=1234).check()


-def test_hdfs_check_with_hooks(request, hdfs_server):
+def test_hdfs_file_connection_check_with_hooks(request, hdfs_server):
     from onetl.connection import HDFS

     @HDFS.slots.is_namenode_active.bind
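The last hunk above cuts off right after the slot decorator. For orientation, a hedged sketch of how such a hook registration might look in onETL; the decorator path is taken verbatim from the hunk, but the hook body and its exact signature here are assumptions:

from onetl.connection import HDFS
from onetl.hooks import hook

@HDFS.slots.is_namenode_active.bind
@hook
def is_namenode_active(host: str, cluster: str | None) -> bool:
    # Assumption: report every namenode as active so HDFS.check()
    # can proceed without probing a real HA cluster.
    return True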
diff --git a/tests/tests_integration/tests_file_connection_integration/test_s3_file_connection_integration.py b/tests/tests_integration/tests_file_connection_integration/test_s3_file_connection_integration.py
new file mode 100644
index 000000000..6c8c2dde8
--- /dev/null
+++ b/tests/tests_integration/tests_file_connection_integration/test_s3_file_connection_integration.py
@@ -0,0 +1,52 @@
+import logging
+import os
+
+import pytest
+
+pytestmark = [pytest.mark.s3, pytest.mark.file_connection, pytest.mark.connection]
+
+
+def test_s3_file_connection_check_success(caplog, s3_file_connection):
+    s3 = s3_file_connection
+    with caplog.at_level(logging.INFO):
+        assert s3.check() == s3
+
+    assert "type = S3" in caplog.text
+    assert f"host = '{s3.host}'" in caplog.text
+    assert f"port = {s3.port}" in caplog.text
+    assert f"bucket = '{s3.bucket}'" in caplog.text
+    assert f"access_key = '{s3.access_key}'" in caplog.text
+    assert "secret_key = SecretStr('**********')" in caplog.text
+    assert s3.secret_key.get_secret_value() not in caplog.text
+    assert "session_token =" not in caplog.text
+
+    assert "Connection is available" in caplog.text
+
+
+def test_s3_file_connection_check_failed(s3_server):
+    from onetl.connection import S3
+
+    anonymous = S3(
+        host=s3_server.host,
+        port=s3_server.port,
+        bucket=s3_server.bucket,
+        protocol=s3_server.protocol,
+        access_key="unknown",
+        secret_key="unknown",
+    )
+
+    with pytest.raises(RuntimeError, match="Connection is unavailable"):
+        anonymous.check()
+
+
+@pytest.mark.parametrize("path_prefix", ["/", ""])
+def test_s3_file_connection_list_dir(path_prefix, s3_file_connection_with_path_and_files):
+    s3, _, _ = s3_file_connection_with_path_and_files
+
+    def dir_content(path):
+        return sorted(os.fspath(file) for file in s3.list_dir(path))
+
+    assert dir_content(f"{path_prefix}data/raw/exclude_dir") == ["excluded1.txt", "nested"]
+    assert dir_content(f"{path_prefix}data/raw") == ["ascii.txt", "exclude_dir", "nested", "some.csv", "utf-8.txt"]
+    assert dir_content(f"{path_prefix}data") == ["raw"]
+    assert "data" in dir_content(path_prefix)  # "tmp" could be present
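The list_dir assertions above pin down the remote layout the new fixture seeds under data/raw. s3_file_connection_with_path_and_files is defined outside this diff; a hedged sketch of how it might populate the bucket through the MinIO client exposed as .client (the fixture wiring, payloads, and the exact nested paths are assumptions inferred from the assertions):

import io
from pathlib import PurePosixPath

import pytest

@pytest.fixture()
def s3_file_connection_with_path_and_files(s3_file_connection):
    # Hypothetical sketch: seed a stable tree so list_dir results are deterministic.
    s3 = s3_file_connection
    keys = [
        "data/raw/ascii.txt",
        "data/raw/utf-8.txt",
        "data/raw/some.csv",
        "data/raw/exclude_dir/excluded1.txt",
        "data/raw/exclude_dir/nested/excluded2.txt",  # assumed nested content
        "data/raw/nested/exclude_dir/excluded3.txt",  # assumed nested content
    ]
    for key in keys:
        payload = b"test data"
        s3.client.put_object(s3.bucket, key, io.BytesIO(payload), len(payload))
    remote_path = PurePosixPath("/data")
    files = [PurePosixPath("/") / key for key in keys]
    return s3, remote_path, files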
dir_content(f"{path_prefix}export/resources/src") == ["exclude_dir"] - assert dir_content(f"{path_prefix}export/resources") == ["src"] - assert dir_content(f"{path_prefix}export") == ["resources"] - assert "export" in dir_content(path_prefix) # "tmp" could present diff --git a/tests/tests_integration/tests_file_connection_integration/test_sftp_integration.py b/tests/tests_integration/tests_file_connection_integration/test_sftp_file_connection_integration.py similarity index 63% rename from tests/tests_integration/tests_file_connection_integration/test_sftp_integration.py rename to tests/tests_integration/tests_file_connection_integration/test_sftp_file_connection_integration.py index 3e0e98360..70faf498e 100644 --- a/tests/tests_integration/tests_file_connection_integration/test_sftp_integration.py +++ b/tests/tests_integration/tests_file_connection_integration/test_sftp_file_connection_integration.py @@ -5,25 +5,26 @@ pytestmark = [pytest.mark.sftp, pytest.mark.file_connection, pytest.mark.connection] -def test_sftp_check(sftp_connection, caplog): +def test_sftp_file_connection_check_success(sftp_file_connection, caplog): + sftp = sftp_file_connection with caplog.at_level(logging.INFO): - assert sftp_connection.check() == sftp_connection + assert sftp.check() == sftp assert "type = SFTP" in caplog.text - assert f"host = '{sftp_connection.host}'" in caplog.text - assert f"port = {sftp_connection.port}" in caplog.text - assert f"user = '{sftp_connection.user}'" in caplog.text + assert f"host = '{sftp.host}'" in caplog.text + assert f"port = {sftp.port}" in caplog.text + assert f"user = '{sftp.user}'" in caplog.text assert "timeout = 10" in caplog.text assert "host_key_check = False" in caplog.text assert "compress = True" in caplog.text assert "key_file" not in caplog.text assert "password = SecretStr('**********')" in caplog.text - assert sftp_connection.password.get_secret_value() not in caplog.text + assert sftp.password.get_secret_value() not in caplog.text assert "Connection is available" in caplog.text -def test_sftp_wrong_source_check(sftp_server): +def test_sftp_file_connection_check_failed(sftp_server): from onetl.connection import SFTP sftp = SFTP(host=sftp_server.host, port=sftp_server.port, user="unknown", password="unknown") diff --git a/tests/tests_integration/tests_file_connection_integration/test_webdav_integration.py b/tests/tests_integration/tests_file_connection_integration/test_webdav_file_connection_integration.py similarity index 53% rename from tests/tests_integration/tests_file_connection_integration/test_webdav_integration.py rename to tests/tests_integration/tests_file_connection_integration/test_webdav_file_connection_integration.py index a41903d2b..3a91523ce 100644 --- a/tests/tests_integration/tests_file_connection_integration/test_webdav_integration.py +++ b/tests/tests_integration/tests_file_connection_integration/test_webdav_file_connection_integration.py @@ -5,23 +5,24 @@ pytestmark = [pytest.mark.webdav, pytest.mark.file_connection, pytest.mark.connection] -def test_webdav_check(webdav_connection, caplog): +def test_webdav_file_connection_check_success(webdav_file_connection, caplog): + webdav = webdav_file_connection with caplog.at_level(logging.INFO): - assert webdav_connection.check() == webdav_connection + assert webdav.check() == webdav assert "type = WebDAV" in caplog.text - assert f"host = '{webdav_connection.host}'" in caplog.text - assert f"port = {webdav_connection.port}" in caplog.text - assert f"protocol = '{webdav_connection.protocol}'" in 
diff --git a/tests/tests_integration/tests_file_connection_integration/test_webdav_integration.py b/tests/tests_integration/tests_file_connection_integration/test_webdav_file_connection_integration.py
similarity index 53%
rename from tests/tests_integration/tests_file_connection_integration/test_webdav_integration.py
rename to tests/tests_integration/tests_file_connection_integration/test_webdav_file_connection_integration.py
index a41903d2b..3a91523ce 100644
--- a/tests/tests_integration/tests_file_connection_integration/test_webdav_integration.py
+++ b/tests/tests_integration/tests_file_connection_integration/test_webdav_file_connection_integration.py
@@ -5,23 +5,24 @@
 pytestmark = [pytest.mark.webdav, pytest.mark.file_connection, pytest.mark.connection]


-def test_webdav_check(webdav_connection, caplog):
+def test_webdav_file_connection_check_success(webdav_file_connection, caplog):
+    webdav = webdav_file_connection
     with caplog.at_level(logging.INFO):
-        assert webdav_connection.check() == webdav_connection
+        assert webdav.check() == webdav

     assert "type = WebDAV" in caplog.text
-    assert f"host = '{webdav_connection.host}'" in caplog.text
-    assert f"port = {webdav_connection.port}" in caplog.text
-    assert f"protocol = '{webdav_connection.protocol}'" in caplog.text
-    assert f"ssl_verify = {webdav_connection.ssl_verify}" in caplog.text
-    assert f"user = '{webdav_connection.user}'" in caplog.text
+    assert f"host = '{webdav.host}'" in caplog.text
+    assert f"port = {webdav.port}" in caplog.text
+    assert f"protocol = '{webdav.protocol}'" in caplog.text
+    assert f"ssl_verify = {webdav.ssl_verify}" in caplog.text
+    assert f"user = '{webdav.user}'" in caplog.text
     assert "password = SecretStr('**********')" in caplog.text
-    assert webdav_connection.password.get_secret_value() not in caplog.text
+    assert webdav.password.get_secret_value() not in caplog.text

     assert "Connection is available" in caplog.text


-def test_webdav_wrong_source_check(webdav_server):
+def test_webdav_file_connection_check_failed(webdav_server):
     from onetl.connection import WebDAV

     webdav = WebDAV(
diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_mongodb/test_strategy_incremental_batch_mongodb.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_mongodb/test_strategy_incremental_batch_mongodb.py
index 835ddff3d..71763dc01 100644
--- a/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_mongodb/test_strategy_incremental_batch_mongodb.py
+++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_batch_mongodb/test_strategy_incremental_batch_mongodb.py
@@ -11,7 +11,7 @@
 pytestmark = pytest.mark.mongodb


-@pytest.fixture(scope="function")
+@pytest.fixture()
 def df_schema():
     from pyspark.sql.types import (
         DoubleType,
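A note on the @pytest.fixture(scope="function") → @pytest.fixture() hunk above, which recurs in the two MongoDB test files further below: "function" is pytest's default fixture scope, so the change is purely cosmetic. Both spellings create a fixture that is re-evaluated for every test:

import pytest

# These two declarations behave identically (illustration only):
@pytest.fixture()
def df_schema(): ...

@pytest.fixture(scope="function")
def df_schema(): ...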
diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_file.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_file.py
index 8170852a1..4d3e16681 100644
--- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_file.py
+++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_file.py
@@ -10,18 +10,17 @@

 def test_file_downloader_increment(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
     tmp_path_factory,
     tmp_path,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     hwm_store = YAMLHWMStore(path=tmp_path_factory.mktemp("hwmstore"))  # noqa: S306
     local_path = tmp_path_factory.mktemp("local_path")

     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         hwm_type="file_list",
     )
@@ -33,14 +32,14 @@ def test_file_downloader_increment(
             downloaded = downloader.run()

     # without HWM value all the files are shown and uploaded
-    assert len(available) == len(downloaded.successful) == len(upload_test_files)
-    assert sorted(available) == sorted(upload_test_files)
+    assert len(available) == len(downloaded.successful) == len(uploaded_files)
+    assert sorted(available) == sorted(uploaded_files)

-    remote_file_folder = RemoteFolder(name=source_path, instance=file_all_connections.instance_url)
+    remote_file_folder = RemoteFolder(name=remote_path, instance=file_connection.instance_url)
     file_hwm = FileListHWM(source=remote_file_folder)
     file_hwm_name = file_hwm.qualified_name

-    source_files = {RelativePath(file.relative_to(source_path)) for file in upload_test_files}
+    source_files = {RelativePath(file.relative_to(remote_path)) for file in uploaded_files}
     assert source_files == hwm_store.get(file_hwm_name).value

     for _ in "first_inc", "second_inc":
@@ -48,7 +47,7 @@ def test_file_downloader_increment(
         tmp_file = tmp_path / new_file_name
         tmp_file.write_text(f"{secrets.token_hex(10)}")

-        file_all_connections.upload_file(tmp_file, source_path / new_file_name)
+        file_connection.upload_file(tmp_file, remote_path / new_file_name)

         with hwm_store:
             with IncrementalStrategy():
@@ -68,18 +67,17 @@

 def test_file_downloader_increment_fail(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
     tmp_path_factory,
     tmp_path,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     hwm_store = YAMLHWMStore(path=tmp_path_factory.mktemp("hwmstore"))
     local_path = tmp_path_factory.mktemp("local_path")

     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         hwm_type="file_list",
     )
@@ -90,15 +88,15 @@ def test_file_downloader_increment_fail(
             downloaded = downloader.run()

     # without HWM value all the files are shown and uploaded
-    assert len(available) == len(downloaded.successful) == len(upload_test_files)
-    assert sorted(available) == sorted(upload_test_files)
+    assert len(available) == len(downloaded.successful) == len(uploaded_files)
+    assert sorted(available) == sorted(uploaded_files)

-    remote_file_folder = RemoteFolder(name=source_path, instance=file_all_connections.instance_url)
+    remote_file_folder = RemoteFolder(name=remote_path, instance=file_connection.instance_url)
     file_hwm = FileListHWM(source=remote_file_folder)
     file_hwm_name = file_hwm.qualified_name

     # HWM is updated in HWMStore
-    source_files = {RelativePath(file.relative_to(source_path)) for file in upload_test_files}
+    source_files = {RelativePath(file.relative_to(remote_path)) for file in uploaded_files}
     assert source_files == hwm_store.get(file_hwm_name).value

     for _ in "first_inc", "second_inc":
@@ -106,7 +104,7 @@ def test_file_downloader_increment_fail(
         tmp_file = tmp_path / new_file_name
         tmp_file.write_text(f"{secrets.token_hex(10)}")

-        file_all_connections.upload_file(tmp_file, source_path / new_file_name)
+        file_connection.upload_file(tmp_file, remote_path / new_file_name)

         # while loading data, a crash occurs before exiting the context manager
         with contextlib.suppress(RuntimeError):
@@ -128,18 +126,17 @@

 def test_file_downloader_increment_hwm_is_ignored_for_user_input(
-    file_all_connections,
-    source_path,
-    upload_test_files,
+    file_connection_with_path_and_files,
    tmp_path_factory,
     tmp_path,
 ):
+    file_connection, remote_path, uploaded_files = file_connection_with_path_and_files
     hwm_store = YAMLHWMStore(path=tmp_path_factory.mktemp("hwm_store"))
     local_path = tmp_path_factory.mktemp("local_path")

     downloader = FileDownloader(
-        connection=file_all_connections,
-        source_path=source_path,
+        connection=file_connection,
+        source_path=remote_path,
         local_path=local_path,
         hwm_type="file_list",
         options=FileDownloader.Options(mode="overwrite"),
@@ -151,7 +148,7 @@ def test_file_downloader_increment_hwm_is_ignored_for_user_input(
             downloader.run()

     # download files from list
-    download_result = downloader.run(upload_test_files)
+    download_result = downloader.run(uploaded_files)

     # all the files are downloaded, HWM is ignored
-    assert len(download_result.successful) == len(upload_test_files)
+    assert len(download_result.successful) == len(uploaded_files)
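Stripped of test scaffolding, the incremental pattern these three tests exercise is short. A minimal sketch, assuming a ready file connection and the import paths used by this version of onETL (onetl.core / onetl.strategy / onetl.hwm.store):

from onetl.core import FileDownloader
from onetl.hwm.store import YAMLHWMStore
from onetl.strategy import IncrementalStrategy

# `connection` is assumed to be any onETL file connection (FTP, SFTP, S3, ...)
downloader = FileDownloader(
    connection=connection,
    source_path="/data/raw",
    local_path="/tmp/downloaded",
    hwm_type="file_list",  # remember already-seen files in a FileListHWM
)

# The first run downloads everything and saves the file list as the HWM value;
# later runs inside the strategy only pick up files that appeared since.
with YAMLHWMStore(path="/tmp/hwm_store"):
    with IncrementalStrategy():
        result = downloader.run()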
diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py
index 45f443318..71528bfc8 100644
--- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py
+++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_mongodb.py
@@ -9,7 +9,7 @@
 pytestmark = pytest.mark.mongodb


-@pytest.fixture(scope="function")
+@pytest.fixture()
 def df_schema():
     from pyspark.sql.types import (
         DoubleType,
diff --git a/tests/tests_unit/test_db/test_db_reader_unit/test_mongodb_reader_unit.py b/tests/tests_unit/test_db/test_db_reader_unit/test_mongodb_reader_unit.py
index c296b2742..bd92a8638 100644
--- a/tests/tests_unit/test_db/test_db_reader_unit/test_mongodb_reader_unit.py
+++ b/tests/tests_unit/test_db/test_db_reader_unit/test_mongodb_reader_unit.py
@@ -6,7 +6,7 @@
 pytestmark = pytest.mark.mongodb


-@pytest.fixture(scope="function")
+@pytest.fixture()
 def df_schema():
     from pyspark.sql.types import (
         DoubleType,