Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Storage][Spot] Resolving failure to exclude local files for managed spot jobs using git --ignored outputs #2018

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
b87ed7e
update excluding commands
landscapepainter May 7, 2023
fd4ccbb
exclude from .git/info/exclude
landscapepainter May 7, 2023
6492b98
added test in test_smoke.py
landscapepainter May 9, 2023
8879355
test-fix indent
landscapepainter May 9, 2023
40af8ed
format
landscapepainter May 10, 2023
45dc8ad
remove unused
landscapepainter May 10, 2023
2db6df3
fix
landscapepainter May 10, 2023
9eee64f
remove unused
landscapepainter May 10, 2023
8b925ae
robust formatting
landscapepainter May 17, 2023
a0795bc
update for [!01] and ? for gsutil
landscapepainter May 17, 2023
be85cd1
nit
landscapepainter May 17, 2023
436bfd0
add robust test case for gitignore sync
landscapepainter May 22, 2023
e478c1e
updating the test
landscapepainter May 24, 2023
9aca4fa
test update
landscapepainter May 25, 2023
d8283e5
nit
landscapepainter May 25, 2023
c95a2ef
.gitignore exclusion filter using rsync
landscapepainter May 27, 2023
36a845f
Merge branch 'master' into managed_spot_exclude_fix_rsync
landscapepainter May 27, 2023
5315a72
update test_smoke
landscapepainter May 28, 2023
5919521
format
landscapepainter May 28, 2023
3de671c
nit
landscapepainter May 28, 2023
4f18304
nit
landscapepainter May 28, 2023
0f7ab57
nit
landscapepainter May 28, 2023
ddc23b5
gitignore filter using git
landscapepainter Jun 3, 2023
c79f7f2
Update sky/data/storage.py
landscapepainter Jun 9, 2023
96ed4c7
nti updates
landscapepainter Jun 9, 2023
26596ef
update comments
landscapepainter Jun 10, 2023
da64fad
updates
landscapepainter Jun 11, 2023
d73cf40
test_smoke update
landscapepainter Jun 11, 2023
f624f91
format
landscapepainter Jun 11, 2023
de85068
Merge branch 'master' into managed_spot_exclude_fix_git
landscapepainter Jul 12, 2023
4a6e435
Update sky/data/storage_utils.py
landscapepainter Jul 26, 2023
f520aa9
Merge branch 'master' into managed_spot_exlcude_fix_git
landscapepainter Jul 26, 2023
b08cf44
Merge branch 'master' and update try except block
landscapepainter Jul 26, 2023
5a6ca6c
format
landscapepainter Jul 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 27 additions & 13 deletions sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from sky.data import data_transfer
from sky.data import data_utils
from sky.data import mounting_utils
from sky.data import storage_utils
from sky import exceptions
from sky import global_user_state
from sky import sky_logging
Expand Down Expand Up @@ -1092,10 +1093,14 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
sync_command = (
'aws s3 sync --no-follow-symlinks --exclude ".git/*" '
f'{src_dir_path} '
f's3://{self.name}/{dest_dir_name}')
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join(
[f'--exclude "{file_name}"' for file_name in excluded_list])
sync_command = (f'aws s3 sync --no-follow-symlinks {excludes} '
f'{src_dir_path} '
f's3://{self.name}/{dest_dir_name}')
return sync_command

# Generate message for upload
Expand Down Expand Up @@ -1515,10 +1520,14 @@ def get_file_sync_command(base_dir_path, file_names):
return sync_command

def get_dir_sync_command(src_dir_path, dest_dir_name):
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
# we exclude .git directory from the sync
excluded_list.append(r'^\.git/.*$')
excludes = '|'.join(excluded_list)
gsutil_alias, alias_gen = data_utils.get_gsutil_command()
sync_command = (f'{alias_gen}; {gsutil_alias} '
f'rsync -e -r -x \'.git/*\' {src_dir_path} '
f'rsync -e -r -x \'({excludes})\' {src_dir_path} '
f'gs://{self.name}/{dest_dir_name}')
return sync_command

Expand Down Expand Up @@ -1843,15 +1852,20 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join(
[f'--exclude "{file_name}"' for file_name in excluded_list])
endpoint_url = cloudflare.create_endpoint()
sync_command = (
'AWS_SHARED_CREDENTIALS_FILE='
f'{cloudflare.R2_CREDENTIALS_PATH} '
'aws s3 sync --no-follow-symlinks --exclude ".git/*" '
f'{src_dir_path} '
f's3://{self.name}/{dest_dir_name} '
f'--endpoint {endpoint_url} '
f'--profile={cloudflare.R2_PROFILE_NAME}')

sync_command = ('AWS_SHARED_CREDENTIALS_FILE='
f'{cloudflare.R2_CREDENTIALS_PATH} '
f'aws s3 sync --no-follow-symlinks {excludes} '
f'{src_dir_path} '
f's3://{self.name}/{dest_dir_name} '
f'--endpoint {endpoint_url} '
f'--profile={cloudflare.R2_PROFILE_NAME}')
return sync_command

# Generate message for upload
Expand Down
110 changes: 110 additions & 0 deletions sky/data/storage_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
"""Utility functions for the storage module."""
import colorama
import os
import subprocess
from typing import Any, Dict, List

from sky import exceptions
from sky import sky_logging
from sky.utils import log_utils
from sky.utils.cli_utils import status_utils

logger = sky_logging.init_logger(__name__)

_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG = (
f'{colorama.Fore.YELLOW}Warning: Files/dirs '
'specified in .gitignore will be uploaded '
'to the cloud storage for {path!r}'
'due to the following error: {error_msg!r}')


def format_storage_table(storages: List[Dict[str, Any]],
show_all: bool = False) -> str:
Expand Down Expand Up @@ -49,3 +59,103 @@ def format_storage_table(storages: List[Dict[str, Any]],
return str(storage_table)
else:
return 'No existing storage.'


def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
""" Lists files and patterns ignored by git in the source directory

Runs `git status --ignored` which returns a list of excluded files and
patterns read from .gitignore and .git/info/exclude using git.
`git init` is run if SRC_DIR_PATH is not a git repository and removed
after obtaining excluded list.

Returns:
List[str] containing files and patterns to be ignored. Some of the
patterns include, **/mydir/*.txt, !myfile.log, or file-*/.
"""
expand_src_dir_path = os.path.expanduser(src_dir_path)

git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude')
gitignore_path = os.path.join(expand_src_dir_path, '.gitignore')

git_exclude_exists = os.path.isfile(git_exclude_path)
gitignore_exists = os.path.isfile(gitignore_path)

# This command outputs a list to be excluded according to .gitignore
# and .git/info/exclude
filter_cmd = f'git -C {expand_src_dir_path} status --ignored --porcelain=v1'
excluded_list: List[str] = []

if git_exclude_exists or gitignore_exists:
try:
output = subprocess.run(filter_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
text=True)
except subprocess.CalledProcessError as e:
# when the SRC_DIR_PATH is not a git repo and .git
# does not exist in it
if e.returncode == exceptions.GIT_FATAL_EXIT_CODE:
if 'not a git repository' in e.stderr:
# Check if the user has 'write' permission to
# SRC_DIR_PATH
if not os.access(expand_src_dir_path, os.W_OK):
error_msg = 'Write permission denial'
logger.warning(
_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
path=src_dir_path, error_msg=error_msg))
return excluded_list
init_cmd = f'git -C {expand_src_dir_path} init'
try:
subprocess.run(init_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True)
output = subprocess.run(filter_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
text=True)
except subprocess.CalledProcessError as e:
logger.warning(
_FILE_EXCLUSION_FROM_GITIGNORE_FAILURE_MSG.format(
path=src_dir_path, error_msg=e.stderr))
return excluded_list
if git_exclude_exists:
# removes all the files/dirs created with 'git init'
# under .git/ except .git/info/exclude
remove_files_cmd = (f'find {expand_src_dir_path}' \
f'/.git -path {git_exclude_path}' \
' -prune -o -type f -exec rm -f ' \
'{} +')
remove_dirs_cmd = (f'find {expand_src_dir_path}' \
f'/.git -path {git_exclude_path}' \
' -o -type d -empty -delete')
subprocess.run(remove_files_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True)
subprocess.run(remove_dirs_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True)

output_list = output.stdout.split('\n')
for line in output_list:
# FILTER_CMD outputs items preceded by '!!'
# to specify excluded files/dirs
# e.g., '!! mydir/' or '!! mydir/myfile.txt'
if line.startswith('!!'):
to_be_excluded = line[3:]
if line.endswith('/'):
# aws s3 sync and gsutil rsync require * to exclude
# files/dirs under the specified directory.
to_be_excluded += '*'
excluded_list.append(to_be_excluded)
return excluded_list
2 changes: 2 additions & 0 deletions sky/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
RSYNC_FILE_NOT_FOUND_CODE = 23
# Arbitrarily chosen value. Used in SkyPilot's storage mounting scripts
MOUNT_PATH_NON_EMPTY_CODE = 42
# Return code when git command is ran in a dir that is not git repo
GIT_FATAL_EXIT_CODE = 128


class ResourcesUnavailableError(Exception):
Expand Down
8 changes: 8 additions & 0 deletions tests/git_info_exclude_test
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
question_mark/excluded?.txt
square_bracket/excluded[0-9].txt
square_bracket_single/excluded[01].txt
square_bracket_excla/excluded[!01].txt
square_bracket_alpha/excluded[a-z].txt
excluded_dir/
nested_double_asterisk/**/also_exclude.txt
nested_wildcard_dir/*day/also_exclude.txt
11 changes: 11 additions & 0 deletions tests/gitignore_test
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# This test should upload only included.txt, included.log, and included_dir/included.log
**/double_asterisk_excluded
**/double_asterisk_excluded_dir
**/parent/*.txt
**/parent/child/*.txt
*.log
exp-*/
!included.*
excluded.*
/front_slash_excluded
no_slash_excluded
Loading