forked from datalad/datalad-next
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
iter_gitworktree()
for processing work tree content
The iterator is also integrated with `ls-file-collection` as collection type `gitworktree`. Closes datalad#350 Ping datalad#323
- Loading branch information
Showing
4 changed files
with
312 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
:toctree: generated | ||
directory | ||
gitworktree | ||
tarfile | ||
utils | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,247 @@ | ||
"""Report on the content of a Git repository worktree | ||
The main functionality is provided by the :func:`iter_gitworktree()` function. | ||
""" | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from enum import Enum | ||
import logging | ||
from pathlib import ( | ||
Path, | ||
PurePath, | ||
PurePosixPath, | ||
) | ||
import re | ||
from typing import ( | ||
Dict, | ||
Generator, | ||
List, | ||
Tuple, | ||
) | ||
|
||
from datalad_next.runners import ( | ||
DEVNULL, | ||
LineSplitter, | ||
ThreadedRunner, | ||
StdOutCaptureGeneratorProtocol, | ||
) | ||
|
||
from .utils import ( | ||
FileSystemItem, | ||
PathBasedItem, | ||
) | ||
|
||
lgr = logging.getLogger('datalad.ext.next.iter_collections.git_worktree') | ||
|
||
|
||
# TODO Could be `StrEnum`, came with PY3.11 | ||
class GitTreeItemType(Enum):
    """Kinds of items that can be found in a Git tree.

    The value of every member is the same string as its name.
    """
    # plain file (mode 100644 in ``_mode_type_map``)
    file = 'file'
    # file with mode 100755
    executablefile = 'executablefile'
    # mode 120000
    symlink = 'symlink'
    # mode 040000
    directory = 'directory'
    # mode 160000
    submodule = 'submodule'
|
||
|
||
# TODO maybe establish GitTreeItem and derive from that | ||
@dataclass
class GitWorktreeItem(PathBasedItem):
    """Path-only report on one item of a Git work tree.

    ``gitsha`` and ``gittype`` default to ``None``; ``iter_gitworktree()``
    leaves them at ``None`` for untracked content.
    """
    # path of the item in POSIX notation
    name: PurePosixPath
    # Git's blob identifier, i.e. what `git hash-object` prints. This is NOT
    # the SHA1 of the bare file content; it is computed roughly like
    # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum`
    gitsha: str | None = None
    # item type as recorded by Git
    gittype: GitTreeItemType | None = None
|
||
|
||
@dataclass
class GitWorktreeFileSystemItem(FileSystemItem):
    """File system item of a Git work tree, extended with Git properties.

    ``gitsha`` and ``gittype`` default to ``None``; ``iter_gitworktree()``
    leaves them at ``None`` for untracked content.
    """
    # Git's blob identifier, i.e. what `git hash-object` prints. This is NOT
    # the SHA1 of the bare file content; it is computed roughly like
    # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum`
    gitsha: str | None = None
    # item type as recorded by Git
    gittype: GitTreeItemType | None = None
|
||
|
||
# stolen from GitRepo.get_content_info() | ||
# Matches one record of `git ls-files --stage -z`, which is laid out as
# "MODE OBJECT STAGE<TAB>FILE".
#
# The object id is restricted to hex digits and the stage to decimal digits,
# so that the FIRST tab of a record always ends the property section. With
# unconstrained `.*` groups, a file name containing " ...<TAB>" would be split
# in the wrong place, and an untracked path such as "2023 notes v2<TAB>x"
# would be mistaken for a tracked record.
#
# DOTALL is needed because records are NUL-separated and file names are
# reported verbatim, so a name may legitimately contain newlines.
_lsfiles_props_re = re.compile(
    r'(?P<mode>[0-9]+) (?P<gitsha>[0-9a-f]+) ([0-9]+)\t(?P<fname>.*)$',
    re.DOTALL,
)
|
||
# Lookup from the mode string of a `git ls-files --stage` record to the
# matching item type.
_mode_type_map = dict((
    ('100644', GitTreeItemType.file),
    ('100755', GitTreeItemType.executablefile),
    ('040000', GitTreeItemType.directory),
    ('120000', GitTreeItemType.symlink),
    ('160000', GitTreeItemType.submodule),
))
|
||
# `git ls-files` arguments for each supported mode of reporting untracked
# content. All modes share `--exclude-standard --others`; the modes only
# differ in the directory-related flags appended to that.
lsfiles_untracked_args = {
    mode: ('--exclude-standard', '--others', *dir_flags)
    for mode, dir_flags in (
        ('all', ()),
        ('whole-dir', ('--directory',)),
        ('no-empty-dir', ('--directory', '--no-empty-directory')),
    )
}
|
||
|
||
def iter_gitworktree(
    path: Path,
    *,
    include_untracked: str | None = 'all',
    link_target: bool = False,
    hash: List[str] | None = None,
) -> Generator[GitWorktreeItem | GitWorktreeFileSystemItem, None, None]:
    """Uses ``git ls-files`` to report on a work tree of a Git repository

    This iterator can be used to report on all tracked, and untracked content
    of a Git repository's work tree. This includes files that have been removed
    from the work tree (deleted), unless their removal has already been staged.

    For any tracked content, yielded items include type information and gitsha
    as last known to Git. This means that such reports reflect the last
    committed or staged content, not the state of a potential unstaged
    modification in the work tree.

    When no reporting of link targets or hashes is requested, items of type
    :class:`GitWorktreeItem` are yielded, otherwise
    :class:`GitWorktreeFileSystemItem` instances. In both cases, ``gitsha``
    and ``gittype`` properties are provided. Either of them being ``None``
    indicates untracked work tree content.

    If ``git ls-files`` reports the same path more than once in a row
    (multi-stage report), a single item is yielded for that path, carrying
    the properties of the last report.

    .. note::
      The ``gitsha`` is not equivalent to a SHA1 hash of a file's content,
      but is the SHA-type blob identifier as reported and used by Git.

    Parameters
    ----------
    path: Path
      Path of a directory in a Git repository to report on. This directory
      need not be the root directory of the repository, but must be part of
      the repository's work tree.
    include_untracked: {'all', 'whole-dir', 'no-empty-dir'} or None, optional
      If not ``None``, also reports on untracked work tree content.
      ``all`` reports on any untracked file; ``whole-dir`` yields a single
      report for a directory that is entirely untracked, and not individual
      untracked files in it; ``no-empty-dir`` skips any reports on
      untracked empty directories. Untracked content is yielded as an item
      whose ``gitsha`` and ``gittype`` are ``None``. Any other value
      raises ``KeyError``.
    link_target: bool, optional
      If true, :class:`GitWorktreeFileSystemItem` instances are yielded.
      The flag is passed on to ``GitWorktreeFileSystemItem.from_path()``.
    hash: list(str), optional
      If given and not empty, :class:`GitWorktreeFileSystemItem` instances
      are yielded. The value is passed on to
      ``GitWorktreeFileSystemItem.from_path()`` (presumably names of hash
      algorithms to compute; see that method for the supported values).

    Yields
    ------
    :class:`GitWorktreeItem` or `GitWorktreeFileSystemItem`
    """
    lsfiles_args = ['--stage', '--cached']
    if include_untracked:
        lsfiles_args.extend(lsfiles_untracked_args[include_untracked])

    # helper to handle multi-stage reports by ls-files:
    # (path, type, gitsha) of the most recent tracked record, held back until
    # we know that the next record is about a different path
    pending_item = None

    for line in _git_ls_files(path, *lsfiles_args):
        parsed = _lsfiles_line2props(line)
        if parsed is None:
            # the parser filtered this line out (it does so for `.git/`
            # paths); there is nothing to report and nothing to unpack
            continue
        ipath, lsfiles_props = parsed

        if pending_item is not None and pending_item[0] != ipath:
            # report on a pending item, this is not a "higher-stage"
            # report by ls-files
            yield _get_item(path, link_target, hash, *pending_item)
            pending_item = None

        if lsfiles_props is None:
            # when no properties were produced, this is a
            # category "other" report (i.e., untracked content)
            # the path is always relative-POSIX
            yield _get_item(path, link_target, hash, ipath)
            continue

        # do not yield immediately, wait for a possible higher-stage
        # report in the next loop iteration
        pending_item = (
            ipath,
            _mode_type_map[lsfiles_props['mode']],
            lsfiles_props['gitsha'],
        )

    # the last tracked record has no successor that would trigger its report
    if pending_item is not None:
        yield _get_item(path, link_target, hash, *pending_item)
|
||
|
||
def _get_item(
    basepath: Path,
    link_target: bool,
    hash: List[str] | None,
    ipath: PurePosixPath,
    type: GitTreeItemType | None = None,
    gitsha: str | None = None,
) -> GitWorktreeItem | GitWorktreeFileSystemItem:
    """Build the result item for a single work tree path.

    Parameters
    ----------
    basepath: Path
      Directory that ``ipath`` is relative to.
    link_target: bool
      Passed on to ``GitWorktreeFileSystemItem.from_path()``.
    hash: list(str) or None
      Passed on to ``GitWorktreeFileSystemItem.from_path()``.
    ipath: PurePosixPath
      Path of the item, relative to ``basepath``.
    type: GitTreeItemType, optional
      Item type known to Git, ``None`` if there is none.
    gitsha: str, optional
      Git's identifier for the item, ``None`` if there is none.

    Returns
    -------
    GitWorktreeItem or GitWorktreeFileSystemItem
      A plain :class:`GitWorktreeItem` when neither ``link_target`` nor
      ``hash`` is truthy, otherwise a :class:`GitWorktreeFileSystemItem`
      created from the item's path on the file system.
    """
    wants_fs_info = link_target or hash
    if not wants_fs_info:
        # cheap variant, no file system access required
        return GitWorktreeItem(name=ipath, gittype=type, gitsha=gitsha)

    fs_item = GitWorktreeFileSystemItem.from_path(
        basepath / ipath,
        link_target=link_target,
        hash=hash,
    )
    # make sure the name/id is the path relative to the basepath
    fs_item.name = PurePath(ipath)
    # only overwrite the Git properties with values we actually have
    if type is not None:
        fs_item.gittype = type
    if gitsha is not None:
        fs_item.gitsha = gitsha
    return fs_item
|
||
|
||
def _lsfiles_line2props(
    line: str
) -> Tuple[PurePosixPath, Dict[str, str] | None] | None:
    """Parse a single record of ``git ls-files`` output.

    Parameters
    ----------
    line: str
      One record, without its terminating separator.

    Returns
    -------
    tuple or None
      - ``(path, props)`` for a record that matches ``_lsfiles_props_re``
        (tracked content); ``props`` is a dict with the keys ``gitsha``
        and ``mode``.
      - ``(path, None)`` for any other record (untracked content), where
        the whole record is taken to be the path.
      - ``None`` for a non-matching record that starts with ``.git/``.
        Such records are filtered out and callers must skip them.
    """
    props = _lsfiles_props_re.match(line)
    if props:
        # Git always reports in POSIX notation
        return PurePosixPath(props.group('fname')), dict(
            gitsha=props.group('gitsha'),
            mode=props.group('mode'),
        )

    # Kludge: Filter out paths starting with .git/ to work around
    # an `ls-files -o` bug that was fixed in Git 2.25.
    #
    # TODO: Drop this condition when GIT_MIN_VERSION is at least
    # 2.25.
    if line.startswith(".git/"):
        lgr.debug("Filtering out .git/ file: %s", line)
        return None

    # not known to Git, but Git always reports POSIX;
    # we have nothing but the path (untracked)
    return PurePosixPath(line), None
|
||
|
||
def _git_ls_files(path, *args):
    """Run ``git ls-files -z`` in ``path`` and yield its output records.

    Parameters
    ----------
    path:
      Directory to run the command in (used as ``cwd``).
    *args:
      Additional arguments, appended to ``git ls-files -z`` as given.

    Yields
    ------
    str
      One zero-byte-delimited record of the command output at a time,
      without the delimiter.

    Raises
    ------
    UnicodeDecodeError
      If the command output is not valid UTF-8.
    """
    # local import: only needed here
    import codecs

    # we use a plain runner to avoid the overhead of a GitRepo instance
    runner = ThreadedRunner(
        cmd=[
            'git', 'ls-files',
            # we rely on zero-byte splitting below
            '-z',
            # otherwise take whatever is coming in
            *args,
        ],
        protocol_class=StdOutCaptureGeneratorProtocol,
        stdin=DEVNULL,
        # run in the directory we want info on
        cwd=path,
    )
    line_splitter = LineSplitter('\0', keep_ends=False)
    # Output arrives in chunks of bytes, and a chunk boundary can fall in the
    # middle of a multi-byte UTF-8 character. Decoding each chunk on its own
    # would fail for such output; an incremental decoder buffers the
    # incomplete trailing bytes until the next chunk completes them.
    decoder = codecs.getincrementaldecoder('utf-8')()
    # for each command output chunk received by the runner
    for content in runner.run():
        text = decoder.decode(content)
        if not text:
            # the chunk held nothing but the start of a character
            continue
        # for each zerobyte-delimited "line" in the output
        yield from line_splitter.process(text)
    # signal end-of-input: raises UnicodeDecodeError if the output stopped
    # in the middle of a character, rather than dropping the bytes silently
    decoder.decode(b'', final=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters