diff --git a/sky/data/storage.py b/sky/data/storage.py index a9d6ce9f95c..66276e0f254 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -20,6 +20,7 @@ from sky.data import data_transfer from sky.data import data_utils from sky.data import mounting_utils +from sky.data import storage_utils from sky import exceptions from sky import global_user_state from sky import sky_logging @@ -1092,10 +1093,14 @@ def get_file_sync_command(base_dir_path, file_names): def get_dir_sync_command(src_dir_path, dest_dir_name): # we exclude .git directory from the sync - sync_command = ( - 'aws s3 sync --no-follow-symlinks --exclude ".git/*" ' - f'{src_dir_path} ' - f's3://{self.name}/{dest_dir_name}') + excluded_list = storage_utils.get_excluded_files_from_gitignore( + src_dir_path) + excluded_list.append('.git/*') + excludes = ' '.join( + [f'--exclude "{file_name}"' for file_name in excluded_list]) + sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} ' + f'{src_dir_path} ' + f's3://{self.name}/{dest_dir_name}') return sync_command # Generate message for upload @@ -1515,10 +1520,14 @@ def get_file_sync_command(base_dir_path, file_names): return sync_command def get_dir_sync_command(src_dir_path, dest_dir_name): + excluded_list = storage_utils.get_excluded_files_from_gitignore( + src_dir_path) # we exclude .git directory from the sync + excluded_list.append(r'^\.git/.*$') + excludes = '|'.join(excluded_list) gsutil_alias, alias_gen = data_utils.get_gsutil_command() sync_command = (f'{alias_gen}; {gsutil_alias} ' - f'rsync -e -r -x \'.git/*\' {src_dir_path} ' + f'rsync -e -r -x \'({excludes})\' {src_dir_path} ' f'gs://{self.name}/{dest_dir_name}') return sync_command @@ -1843,15 +1852,20 @@ def get_file_sync_command(base_dir_path, file_names): def get_dir_sync_command(src_dir_path, dest_dir_name): # we exclude .git directory from the sync + excluded_list = storage_utils.get_excluded_files_from_gitignore( + src_dir_path) + excluded_list.append('.git/*') + excludes = 
' '.join( + [f'--exclude "{file_name}"' for file_name in excluded_list]) endpoint_url = cloudflare.create_endpoint() - sync_command = ( - 'AWS_SHARED_CREDENTIALS_FILE=' - f'{cloudflare.R2_CREDENTIALS_PATH} ' - 'aws s3 sync --no-follow-symlinks --exclude ".git/*" ' - f'{src_dir_path} ' - f's3://{self.name}/{dest_dir_name} ' - f'--endpoint {endpoint_url} ' - f'--profile={cloudflare.R2_PROFILE_NAME}') + + sync_command = ('AWS_SHARED_CREDENTIALS_FILE=' + f'{cloudflare.R2_CREDENTIALS_PATH} ' + f'aws s3 sync --no-follow-symlinks {excludes} ' + f'{src_dir_path} ' + f's3://{self.name}/{dest_dir_name} ' + f'--endpoint {endpoint_url} ' + f'--profile={cloudflare.R2_PROFILE_NAME}') return sync_command # Generate message for upload diff --git a/sky/data/storage_utils.py b/sky/data/storage_utils.py index c3b157ae9b7..8ee493f66b6 100644 --- a/sky/data/storage_utils.py +++ b/sky/data/storage_utils.py @@ -1,12 +1,22 @@ """Utility functions for the storage module.""" +import colorama +import os +import subprocess from typing import Any, Dict, List +from sky import exceptions from sky import sky_logging from sky.utils import log_utils from sky.utils.cli_utils import status_utils logger = sky_logging.init_logger(__name__) +_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = ( + f'{colorama.Fore.YELLOW}Warning: Files/dirs ' + 'specified in .gitignore will be uploaded ' + 'to the cloud storage for {path!r}' + 'due to the following error: {error_msg!r}') + def format_storage_table(storages: List[Dict[str, Any]], show_all: bool = False) -> str: @@ -49,3 +59,103 @@ def format_storage_table(storages: List[Dict[str, Any]], return str(storage_table) else: return 'No existing storage.' + + +def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]: + """ Lists files and patterns ignored by git in the source directory + + Runs `git status --ignored` which returns a list of excluded files and + patterns read from .gitignore and .git/info/exclude using git. 
+ `git init` is run if SRC_DIR_PATH is not a git repository and removed + after obtaining excluded list. + + Returns: + List[str] containing files and patterns to be ignored. Some of the + patterns include, **/mydir/*.txt, !myfile.log, or file-*/. + """ + expand_src_dir_path = os.path.expanduser(src_dir_path) + + git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude') + gitignore_path = os.path.join(expand_src_dir_path, '.gitignore') + + git_exclude_exists = os.path.isfile(git_exclude_path) + gitignore_exists = os.path.isfile(gitignore_path) + + # This command outputs a list to be excluded according to .gitignore + # and .git/info/exclude + filter_cmd = f'git -C {expand_src_dir_path} status --ignored --porcelain=v1' + excluded_list: List[str] = [] + + if git_exclude_exists or gitignore_exists: + try: + output = subprocess.run(filter_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + text=True) + except subprocess.CalledProcessError as e: + # when the SRC_DIR_PATH is not a git repo and .git + # does not exist in it + if e.returncode == exceptions.GIT_FATAL_EXIT_CODE: + if 'not a git repository' in e.stderr: + # Check if the user has 'write' permission to + # SRC_DIR_PATH + if not os.access(expand_src_dir_path, os.W_OK): + error_msg = 'Write permission denial' + logger.warning( + _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format( + path=src_dir_path, error_msg=error_msg)) + return excluded_list + init_cmd = f'git -C {expand_src_dir_path} init' + try: + subprocess.run(init_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True) + output = subprocess.run(filter_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + text=True) + except subprocess.CalledProcessError as e: + logger.warning( + _FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format( + path=src_dir_path, error_msg=e.stderr)) + return excluded_list + if git_exclude_exists: + # removes all the 
files/dirs created with 'git init' + # under .git/ except .git/info/exclude + remove_files_cmd = (f'find {expand_src_dir_path}' \ + f'/.git -path {git_exclude_path}' \ + ' -prune -o -type f -exec rm -f ' \ + '{} +') + remove_dirs_cmd = (f'find {expand_src_dir_path}' \ + f'/.git -path {git_exclude_path}' \ + ' -o -type d -empty -delete') + subprocess.run(remove_files_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True) + subprocess.run(remove_dirs_cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True) + + output_list = output.stdout.split('\n') + for line in output_list: + # FILTER_CMD outputs items preceded by '!!' + # to specify excluded files/dirs + # e.g., '!! mydir/' or '!! mydir/myfile.txt' + if line.startswith('!!'): + to_be_excluded = line[3:] + if line.endswith('/'): + # aws s3 sync and gsutil rsync require * to exclude + # files/dirs under the specified directory. + to_be_excluded += '*' + excluded_list.append(to_be_excluded) + return excluded_list diff --git a/sky/exceptions.py b/sky/exceptions.py index 07cd28afd68..6096bd77214 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -13,6 +13,8 @@ RSYNC_FILE_NOT_FOUND_CODE = 23 # Arbitrarily chosen value. 
Used in SkyPilot's storage mounting scripts MOUNT_PATH_NON_EMPTY_CODE = 42 +# Return code when git command is run in a dir that is not a git repo +GIT_FATAL_EXIT_CODE = 128 class ResourcesUnavailableError(Exception): diff --git a/tests/git_info_exclude_test new file mode 100644 index 00000000000..801578d7cd5 --- /dev/null +++ b/tests/git_info_exclude_test @@ -0,0 +1,8 @@ +question_mark/excluded?.txt +square_bracket/excluded[0-9].txt +square_bracket_single/excluded[01].txt +square_bracket_excla/excluded[!01].txt +square_bracket_alpha/excluded[a-z].txt +excluded_dir/ +nested_double_asterisk/**/also_exclude.txt +nested_wildcard_dir/*day/also_exclude.txt diff --git a/tests/gitignore_test new file mode 100644 index 00000000000..f60bce9bccb --- /dev/null +++ b/tests/gitignore_test @@ -0,0 +1,11 @@ +# This test should upload only included.txt, included.log, and included_dir/included.log +**/double_asterisk_excluded +**/double_asterisk_excluded_dir +**/parent/*.txt +**/parent/child/*.txt +*.log +exp-*/ +!included.* +excluded.* +/front_slash_excluded +no_slash_excluded diff --git a/tests/test_smoke.py b/tests/test_smoke.py index ecb313872a7..7f28be2f760 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -25,6 +25,7 @@ import hashlib import inspect import pathlib +import shutil import subprocess import sys import tempfile @@ -2445,6 +2446,97 @@ class TestStorageWithCredentials: 'abc_', # ends with an underscore ] + GITIGNORE_SYNC_TEST_DIR_STRUCTURE = { + 'double_asterisk': { + 'double_asterisk_excluded': None, + 'double_asterisk_excluded_dir': { + 'dir_excluded': None, + }, + }, + 'double_asterisk_parent': { + 'parent': { + 'also_excluded.txt': None, + 'child': { + 'double_asterisk_parent_child_excluded.txt': None, + }, + 'double_asterisk_parent_excluded.txt': None, + }, + }, + 'excluded.log': None, + 'excluded_dir': { + 'excluded.txt': None, + 'nested_excluded': { + 'excluded': None, + }, + }, + 'exp-1': { + 
'be_excluded': None, + }, + 'exp-2': { + 'be_excluded': None, + }, + 'front_slash_excluded': None, + 'included.log': None, + 'included.txt': None, + 'include_dir': { + 'excluded.log': None, + 'included.log': None, + }, + 'nested_double_asterisk': { + 'one': { + 'also_exclude.txt': None, + }, + 'two': { + 'also_exclude.txt': None, + }, + }, + 'nested_wildcard_dir': { + 'monday': { + 'also_exclude.txt': None, + }, + 'tuesday': { + 'also_exclude.txt': None, + }, + }, + 'no_slash_excluded': None, + 'no_slash_tests': { + 'no_slash_excluded': { + 'also_excluded.txt': None, + }, + }, + 'question_mark': { + 'excluded1.txt': None, + 'excluded@.txt': None, + }, + 'square_bracket': { + 'excluded1.txt': None, + }, + 'square_bracket_alpha': { + 'excludedz.txt': None, + }, + 'square_bracket_excla': { + 'excluded2.txt': None, + 'excluded@.txt': None, + }, + 'square_bracket_single': { + 'excluded0.txt': None, + }, + } + + @staticmethod + def create_dir_structure(base_path, structure): + # creates a given file STRUCTURE in BASE_PATH + for name, substructure in structure.items(): + path = os.path.join(base_path, name) + if substructure is None: + # Create a file + open(path, 'a').close() + else: + # Create a subdirectory + os.mkdir(path) + TestStorageWithCredentials.create_dir_structure( + path, substructure) + @staticmethod def cli_delete_cmd(store_type, bucket_name): if store_type == storage_lib.StoreType.S3: @@ -2481,6 +2573,35 @@ def cli_ls_cmd(store_type, bucket_name, suffix=''): url = f's3://{bucket_name}' return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls {url} --endpoint {endpoint_url} --profile=r2' + @staticmethod + def cli_count_name_in_bucket(store_type, bucket_name, file_name, suffix=''): + if store_type == storage_lib.StoreType.S3: + if suffix: + return f'aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' + else: + return f'aws s3api list-objects --bucket 
"{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)"' + elif store_type == storage_lib.StoreType.GCS: + if suffix: + return f'gsutil ls -r gs://{bucket_name}/{suffix} | grep "{file_name}" | wc -l' + else: + return f'gsutil ls -r gs://{bucket_name} | grep "{file_name}" | wc -l' + elif store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + if suffix: + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --prefix {suffix} --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' + else: + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3api list-objects --bucket "{bucket_name}" --query "length(Contents[?contains(Key,\'{file_name}\')].Key)" --endpoint {endpoint_url} --profile=r2' + + @staticmethod + def cli_count_file_in_bucket(store_type, bucket_name): + if store_type == storage_lib.StoreType.S3: + return f'aws s3 ls s3://{bucket_name} --recursive | wc -l' + elif store_type == storage_lib.StoreType.GCS: + return f'gsutil ls -r gs://{bucket_name}/** | wc -l' + elif store_type == storage_lib.StoreType.R2: + endpoint_url = cloudflare.create_endpoint() + return f'AWS_SHARED_CREDENTIALS_FILE={cloudflare.R2_CREDENTIALS_PATH} aws s3 ls s3://{bucket_name} --recursive --endpoint {endpoint_url} --profile=r2 | wc -l' + @pytest.fixture def tmp_source(self, tmp_path): # Creates a temporary directory with a file in it @@ -2593,6 +2714,37 @@ def tmp_copy_mnt_existing_storage_obj(self, tmp_scratch_storage_obj): yield from self.yield_storage_object(name=storage_name, mode=storage_lib.StorageMode.COPY) + @pytest.fixture + def tmp_gitignore_storage_obj(self, tmp_bucket_name, gitignore_structure): + # Creates a temporary storage object for testing .gitignore filter. + # GITIGNORE_STRUCTURE represents a file structure in a dictionary + # format. 
Created storage object will contain the file structure along + # with .gitignore and .git/info/exclude files to test exclude filter. + # Stores must be added in the test. + with tempfile.TemporaryDirectory() as tmpdir: + # Creates file structure to be uploaded in the Storage + self.create_dir_structure(tmpdir, gitignore_structure) + + # Create .gitignore and list files/dirs to be excluded in it + skypilot_path = os.path.dirname(os.path.dirname(sky.__file__)) + temp_path = f'{tmpdir}/.gitignore' + file_path = os.path.join(skypilot_path, 'tests/gitignore_test') + shutil.copyfile(file_path, temp_path) + + # Create .git/info/exclude and list files/dirs to be excluded in it + temp_path = f'{tmpdir}/.git/info/' + os.makedirs(temp_path) + temp_exclude_path = os.path.join(temp_path, 'exclude') + file_path = os.path.join(skypilot_path, + 'tests/git_info_exclude_test') + shutil.copyfile(file_path, temp_exclude_path) + + # Create sky Storage with the files created + yield from self.yield_storage_object( + name=tmp_bucket_name, + source=tmpdir, + mode=storage_lib.StorageMode.COPY) + @pytest.fixture def tmp_awscli_bucket(self, tmp_bucket_name): # Creates a temporary bucket using awscli @@ -2891,6 +3043,44 @@ def test_invalid_names(self, invalid_name_list, store_type): storage_obj = storage_lib.Storage(name=name) storage_obj.add_store(store_type) + @pytest.mark.parametrize( + 'gitignore_structure, store_type', + [(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.S3), + (GITIGNORE_SYNC_TEST_DIR_STRUCTURE, storage_lib.StoreType.GCS), + pytest.param(GITIGNORE_SYNC_TEST_DIR_STRUCTURE, + storage_lib.StoreType.R2, + marks=pytest.mark.cloudflare)]) + def test_excluded_file_cloud_storage_upload_copy(self, gitignore_structure, + store_type, + tmp_gitignore_storage_obj): + # tests if files included in .gitignore and .git/info/exclude are + # excluded from being transferred to Storage + + tmp_gitignore_storage_obj.add_store(store_type) + + upload_file_name = 'included' + # Count 
the number of files with the given file name + up_cmd = self.cli_count_name_in_bucket(store_type, \ + tmp_gitignore_storage_obj.name, file_name=upload_file_name) + git_exclude_cmd = self.cli_count_name_in_bucket(store_type, \ + tmp_gitignore_storage_obj.name, file_name='.git') + cnt_num_file_cmd = self.cli_count_file_in_bucket( + store_type, tmp_gitignore_storage_obj.name) + + up_output = subprocess.check_output(up_cmd, shell=True) + git_exclude_output = subprocess.check_output(git_exclude_cmd, + shell=True) + cnt_output = subprocess.check_output(cnt_num_file_cmd, shell=True) + + assert '3' in up_output.decode('utf-8'), \ + 'Files to be included are not completely uploaded.' + # 1 is read as .gitignore is uploaded + assert '1' in git_exclude_output.decode('utf-8'), \ + '.git directory should not be uploaded.' + # 4 files include .gitignore, included.log, included.txt, include_dir/included.log + assert '4' in cnt_output.decode('utf-8'), \ + 'Some items listed in .gitignore and .git/info/exclude are not excluded.' + # ---------- Testing YAML Specs ---------- # Our sky storage requires credentials to check the bucket existance when