Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOP-8837] - add Samba file_connection class #150

Merged
merged 16 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .env.docker
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ ONETL_SFTP_PORT=2222
ONETL_SFTP_USER=onetl
ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho

# Samba
ONETL_SAMBA_HOST=samba
ONETL_SAMBA_PROTOCOL=SMB
ONETL_SAMBA_UID=1000
ONETL_SAMBA_GID=1000
ONETL_SAMBA_PORT=445
ONETL_SAMBA_SHARE=SmbShare
ONETL_SAMBA_USER=onetl
ONETL_SAMBA_PASSWORD=awd123fd1

# Webdav
ONETL_WEBDAV_HOST=webdav
ONETL_WEBDAV_PORT=80
Expand Down
10 changes: 10 additions & 0 deletions .env.local
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,16 @@ export ONETL_SFTP_PORT=2222
export ONETL_SFTP_USER=onetl
export ONETL_SFTP_PASSWORD=AesujeifohgoaCu0Boosiet5aimeitho

# Samba
maxim-lixakov marked this conversation as resolved.
Show resolved Hide resolved
export ONETL_SAMBA_HOST=localhost
export ONETL_SAMBA_PROTOCOL=SMB
export ONETL_SAMBA_UID=1000
export ONETL_SAMBA_GID=1000
export ONETL_SAMBA_PORT=445
export ONETL_SAMBA_SHARE=SmbShare
export ONETL_SAMBA_USER=onetl
export ONETL_SAMBA_PASSWORD=awd123fd1

# Webdav
export ONETL_WEBDAV_HOST=localhost
export ONETL_WEBDAV_PORT=8000
Expand Down
1 change: 1 addition & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@
"tests.fixtures.connections.local_fs",
"tests.fixtures.connections.s3",
"tests.fixtures.connections.sftp",
"tests.fixtures.connections.samba",
"tests.fixtures.connections.webdav",
]
12 changes: 12 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ services:
networks:
- onetl

samba:
image: elswork/samba
restart: unless-stopped
ports:
- "139:139"
- "445:445"
volumes:
- ./docker/samba:/share/folder
command: '-u "1000:1000:onetl:onetl:awd123fd1" -s "SmbShare:/share/folder:rw:onetl"'
networks:
- onetl

s3:
image: ${S3_IMAGE:-bitnami/minio:latest}
restart: unless-stopped
Expand Down
1 change: 1 addition & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ RUN pip install \
-r /app/requirements/hdfs.txt \
-r /app/requirements/s3.txt \
-r /app/requirements/sftp.txt \
-r /app/requirements/samba.txt \
-r /app/requirements/webdav.txt \
-r /app/requirements/kerberos.txt \
-r /app/requirements/docs.txt \
Expand Down
4 changes: 4 additions & 0 deletions docker/samba/on_post_init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash

# allow create files and directories
chmod 777 /share/folder
2 changes: 2 additions & 0 deletions onetl/connection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from onetl.connection.file_connection.ftps import FTPS
from onetl.connection.file_connection.hdfs import HDFS
from onetl.connection.file_connection.s3 import S3
from onetl.connection.file_connection.samba import Samba
from onetl.connection.file_connection.sftp import SFTP
from onetl.connection.file_connection.webdav import WebDAV
from onetl.connection.file_df_connection.spark_hdfs import SparkHDFS
Expand All @@ -62,6 +63,7 @@
"HDFS": "hdfs",
"S3": "s3",
"SFTP": "sftp",
"Samba": "samba",
"WebDAV": "webdav",
}

Expand Down
292 changes: 292 additions & 0 deletions onetl/connection/file_connection/samba.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
# Copyright 2023 MTS (Mobile Telesystems)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import os
import stat
import textwrap
from io import BytesIO
from logging import getLogger
from pathlib import Path
from typing import Literal, Optional, Union

from etl_entities.instance import Host
from pydantic import SecretStr, validator

from onetl.connection.file_connection.file_connection import FileConnection
from onetl.hooks import slot, support_hooks
from onetl.impl import LocalPath, RemotePath, RemotePathStat

try:
from smb.smb_structs import OperationFailure
from smb.SMBConnection import SMBConnection
except (ImportError, NameError) as e:
raise ImportError(
textwrap.dedent(
"""
Cannot import module "pysmb".

You should install package as follows:
pip install onetl[samba]

or
pip install onetl[files]
""",
).strip(),
) from e


log = getLogger(__name__)


@support_hooks
class Samba(FileConnection):
"""Samba file connection.

Based on `pysmb library <https://pypi.org/project/pysmb/>`_.

Parameters
----------
host : str
Host of Samba source. For example: ``mydomain.com``.

share : str
The name of the share on the Samba server.

protocol : str, default: ``SMB``
The protocol to use for the connection. Either ``SMB`` or ``NetBIOS``.
Affects the default port and the `is_direct_tcp` flag in `SMBConnection`.

port : int, default: 445
Port of Samba source.

domain : str, default: ``
Domain name for the Samba connection. Empty strings means use ``host`` as domain name.

auth_type : str, default: ``NTLMv2``
The authentication type to use. Either ``NTLMv2`` or ``NTLMv1``.
Affects the `use_ntlm_v2` flag in `SMBConnection`.

user : str, default: None
User, which have access to the file source. Can be `None` for anonymous connection.

password : str, default: None
Password for file source connection. Can be `None` for anonymous connection.

"""

host: Host
share: str
protocol: Union[Literal["SMB"], Literal["NetBIOS"]] = "SMB"
port: Optional[int] = None
domain: Optional[str] = ""
auth_type: Union[Literal["NTLMv1"], Literal["NTLMv2"]] = "NTLMv2"
user: Optional[str] = None
password: Optional[SecretStr] = None

@property
def instance_url(self) -> str:
return f"smb://{self.host}:{self.port}"

@slot
def check(self):
log.info("|%s| Checking connection availability...", self.__class__.__name__)
self._log_parameters()
try:
available_shares = {share.name for share in self.client.listShares()}
if self.share in available_shares:
log.info("|%s| Connection is available.", self.__class__.__name__)
else:
maxim-lixakov marked this conversation as resolved.
Show resolved Hide resolved
log.error(
"|%s| Share %r not found among existing shares %r",
self.__class__.__name__,
self.share,
available_shares,
)
raise ConnectionError("Failed to connect to the Samba server.")
except Exception as exc:
log.exception("|%s| Connection is unavailable", self.__class__.__name__)
raise RuntimeError("Connection is unavailable") from exc

return self

@slot
def path_exists(self, path: os.PathLike | str) -> bool:
try:
self.client.getAttributes(self.share, os.fspath(path))
return True
except OperationFailure:
return False

def _scan_entries(self, path: RemotePath) -> list:
if self._is_dir(path):
return [
entry
for entry in self.client.listPath(
self.share,
os.fspath(path),
)
if entry.filename not in {".", ".."} # Filter out '.' and '..'
]
return [self.client.getAttributes(self.share, os.fspath(path))]

def _extract_name_from_entry(self, entry) -> str:
return entry.filename

def _is_dir_entry(self, top: RemotePath, entry) -> bool:
return entry.isDirectory

def _is_file_entry(self, top: RemotePath, entry) -> bool:
return not entry.isDirectory

def _extract_stat_from_entry(self, top: RemotePath, entry) -> RemotePathStat:
if entry.isDirectory:
return RemotePathStat(st_mode=stat.S_IFDIR)

return RemotePathStat(
st_size=entry.file_size,
st_mtime=entry.last_write_time,
st_uid=entry.filename,
)

def _get_client(self) -> SMBConnection:
is_direct_tcp = self.protocol == "SMB"
use_ntlm_v2 = self.auth_type == "NTLMv2"
conn = SMBConnection(
username=self.user,
password=self.password.get_secret_value() if self.password else None,
my_name="optional_client_name",
remote_name=self.host,
domain=self.domain,
use_ntlm_v2=use_ntlm_v2,
sign_options=2,
is_direct_tcp=is_direct_tcp,
)
conn.connect(self.host, port=self.port)
return conn

def _is_client_closed(self, client: SMBConnection) -> bool:
try:
socket_fileno = client.sock.fileno()
except (AttributeError, OSError):
return True

return socket_fileno == -1

def _close_client(self, client: SMBConnection) -> None:
self.client.close()

def _download_file(self, remote_file_path: RemotePath, local_file_path: LocalPath) -> None:
with open(local_file_path, "wb") as local_file:
self.client.retrieveFile(
self.share,
os.fspath(remote_file_path),
local_file,
)

def _get_stat(self, path: RemotePath) -> RemotePathStat:
info = self.client.getAttributes(self.share, os.fspath(path))

if self.is_dir(os.fspath(path)):
return RemotePathStat(st_mode=stat.S_IFDIR)

return RemotePathStat(
st_size=info.file_size,
st_mtime=info.last_write_time,
st_uid=info.filename,
)

def _remove_file(self, remote_file_path: RemotePath) -> None:
self.client.deleteFiles(
self.share,
os.fspath(remote_file_path),
)

def _create_dir(self, path: RemotePath) -> None:
path_obj = Path(path)
for parent in reversed(path_obj.parents):
# create dirs sequentially as .createDirectory(...) cannot create nested dirs
try:
self.client.getAttributes(self.share, os.fspath(parent))
except OperationFailure:
self.client.createDirectory(self.share, os.fspath(parent))

self.client.createDirectory(self.share, os.fspath(path))

def _upload_file(self, local_file_path: LocalPath, remote_file_path: RemotePath) -> None:
with open(local_file_path, "rb") as file_obj:
self.client.storeFile(
self.share,
os.fspath(remote_file_path),
file_obj,
)

def _rename_file(self, source: RemotePath, target: RemotePath) -> None:
self.client.rename(
self.share,
os.fspath(source),
os.fspath(target),
)

def _remove_dir(self, path: RemotePath) -> None:
files = self.client.listPath(self.share, os.fspath(path))

for item in files:
if item.filename not in {".", ".."}: # skip current and parent directory entries
full_path = path / item.filename
if item.isDirectory:
# recursively delete subdirectory
self._remove_dir(full_path)
else:
self.client.deleteFiles(self.share, os.fspath(full_path))

self.client.deleteDirectory(self.share, os.fspath(path))

def _read_text(self, path: RemotePath, encoding: str) -> str:
return self._read_bytes(path).decode(encoding)

def _read_bytes(self, path: RemotePath) -> bytes:
file_obj = BytesIO()
self.client.retrieveFile(
self.share,
os.fspath(path),
file_obj,
)
file_obj.seek(0)
return file_obj.read()

def _write_text(self, path: RemotePath, content: str, encoding: str) -> None:
self._write_bytes(path, bytes(content, encoding))

def _write_bytes(self, path: RemotePath, content: bytes) -> None:
file_obj = BytesIO(content)

self.client.storeFile(
self.share,
os.fspath(path),
file_obj,
)

def _is_dir(self, path: RemotePath) -> bool:
return self.client.getAttributes(self.share, os.fspath(path)).isDirectory

def _is_file(self, path: RemotePath) -> bool:
return not self.client.getAttributes(self.share, os.fspath(path)).isDirectory

@validator("port", pre=True, always=True)
def _set_port_based_on_protocol(cls, port, values):
if port is None:
return 445 if values.get("protocol") == "SMB" else 139
return port
1 change: 1 addition & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ markers =
postgres: Postgres tests
s3: S3 tests
sftp: SFTP tests
samba: Samba tests
teradata: Teradata tests
webdav: WebDAV tests
1 change: 1 addition & 0 deletions requirements/samba.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pysmb
1 change: 1 addition & 0 deletions requirements/tests/samba.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pysmb
Loading
Loading