Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: increase file scanning performance #486

Merged
3 commits merged on Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 54 additions & 45 deletions tagstudio/src/core/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,9 @@ def _map_filenames_to_entry_ids(self):
"""Maps a full filepath to its corresponding Entry's ID."""
self.filename_to_entry_id_map.clear()
for entry in self.entries:
self.filename_to_entry_id_map[(entry.path / entry.filename)] = entry.id
self.filename_to_entry_id_map[
(self.library_dir / entry.path / entry.filename)
] = entry.id

# def _map_filenames_to_entry_ids(self):
# """Maps the file paths of entries to their index in the library list."""
Expand Down Expand Up @@ -884,59 +886,71 @@ def refresh_dir(self) -> Generator:

# Scans the directory for files, keeping track of:
# - Total file count
# - Files without library entries
# for type in TYPES:
start_time = time.time()
# - Files without Library entries
start_time_total = time.time()
start_time_loop = time.time()
ext_set = set(self.ext_list) # Should be slightly faster
for f in self.library_dir.glob("**/*"):
end_time_loop = time.time()
# Yield output every 1/30 of a second
if (end_time_loop - start_time_loop) > 0.034:
yield self.dir_file_count
start_time_loop = time.time()
try:
# Skip this file if it should be excluded
ext: str = f.suffix.lower()
if (ext in ext_set and self.is_exclude_list) or (
ext not in ext_set and not self.is_exclude_list
):
continue

# Finish if the file/path is already mapped in the Library
if self.filename_to_entry_id_map.get(f) is not None:
# No other checks are required.
self.dir_file_count += 1
continue

# If the file is new, check for validity
if (
"$RECYCLE.BIN" not in f.parts
and TS_FOLDER_NAME not in f.parts
and "tagstudio_thumbs" not in f.parts
and not f.is_dir()
"$RECYCLE.BIN" in f.parts
or TS_FOLDER_NAME in f.parts
or "tagstudio_thumbs" in f.parts
or f.is_dir()
):
if f.suffix.lower() not in self.ext_list and self.is_exclude_list:
self.dir_file_count += 1
file = f.relative_to(self.library_dir)
if file not in self.filename_to_entry_id_map:
self.files_not_in_library.append(file)
elif f.suffix.lower() in self.ext_list and not self.is_exclude_list:
self.dir_file_count += 1
file = f.relative_to(self.library_dir)
try:
_ = self.filename_to_entry_id_map[file]
except KeyError:
# print(file)
self.files_not_in_library.append(file)
continue

# Add the validated new file to the Library
self.dir_file_count += 1
self.files_not_in_library.append(f)

except PermissionError:
logging.info(
f"The File/Folder {f} cannot be accessed, because it requires higher permission!"
)
end_time = time.time()
# Yield output every 1/30 of a second
if (end_time - start_time) > 0.034:
yield self.dir_file_count
start_time = time.time()
# Sorts the files by date modified, descending.
logging.info(f'[LIBRARY] Cannot access "{f}": PermissionError')

yield self.dir_file_count
end_time_total = time.time()
logging.info(
f"[LIBRARY] Scanned directories in {(end_time_total - start_time_total):.3f} seconds"
)
# Sorts the files by creation time (st_birthtime on Windows/macOS, st_ctime elsewhere), descending
if len(self.files_not_in_library) <= 150000:
try:
if platform.system() == "Windows" or platform.system() == "Darwin":
self.files_not_in_library = sorted(
self.files_not_in_library,
key=lambda t: -(self.library_dir / t).stat().st_birthtime, # type: ignore[attr-defined]
key=lambda t: -(t).stat().st_birthtime, # type: ignore[attr-defined]
)
else:
self.files_not_in_library = sorted(
self.files_not_in_library,
key=lambda t: -(self.library_dir / t).stat().st_ctime,
key=lambda t: -(t).stat().st_ctime,
)
except (FileExistsError, FileNotFoundError):
print(
"[LIBRARY] [ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
logging.info(
"[LIBRARY][ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
)
pass
else:
print(
logging.info(
"[LIBRARY][INFO] Not bothering to sort files because there's OVER 150,000! Better sorting methods will be added in the future."
)

Expand All @@ -957,7 +971,7 @@ def remove_entry(self, entry_id: int) -> None:
# Step [1/2]:
# Remove this Entry from the Entries list.
entry = self.get_entry(entry_id)
path = entry.path / entry.filename
path = self.library_dir / entry.path / entry.filename
# logging.info(f'Removing path: {path}')

del self.filename_to_entry_id_map[path]
Expand Down Expand Up @@ -1087,8 +1101,8 @@ def refresh_dupe_files(self, results_filepath: str | Path):
)
)
for match in matches:
file_1 = files[match[0]].relative_to(self.library_dir)
file_2 = files[match[1]].relative_to(self.library_dir)
file_1 = files[match[0]]
file_2 = files[match[1]]

if (
file_1 in self.filename_to_entry_id_map.keys()
Expand Down Expand Up @@ -1289,8 +1303,7 @@ def add_new_files_as_entries(self) -> list[int]:
"""Adds files from the `files_not_in_library` list to the Library as Entries. Returns list of added indices."""
new_ids: list[int] = []
for file in self.files_not_in_library:
path = Path(file)
# print(os.path.split(file))
path = Path(*file.parts[len(self.library_dir.parts) :])
entry = Entry(
id=self._next_entry_id, filename=path.name, path=path.parent, fields=[]
)
Expand All @@ -1301,8 +1314,6 @@ def add_new_files_as_entries(self) -> list[int]:
self.files_not_in_library.clear()
return new_ids

self.files_not_in_library.clear()

def get_entry(self, entry_id: int) -> Entry:
"""Returns an Entry object given an Entry ID."""
return self.entries[self._entry_id_to_index_map[int(entry_id)]]
Expand All @@ -1323,9 +1334,7 @@ def get_entry_id_from_filepath(self, filename: Path):
"""Returns an Entry ID given the full filepath it points to."""
try:
if self.entries:
return self.filename_to_entry_id_map[
Path(filename).relative_to(self.library_dir)
]
return self.filename_to_entry_id_map[filename]
except KeyError:
return -1

Expand Down
11 changes: 5 additions & 6 deletions tagstudio/src/qt/modals/drop_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def copy_files(self):
continue

dest_file = self.get_relative_path(file)
full_dest_path: Path = self.driver.lib.library_dir / dest_file

if file in self.duplicate_files:
duplicated_files_progress += 1
Expand All @@ -115,14 +116,12 @@ def copy_files(self):
if self.choice == 2: # rename
new_name = self.get_renamed_duplicate_filename_in_lib(dest_file)
dest_file = dest_file.with_name(new_name)
self.driver.lib.files_not_in_library.append(dest_file)
self.driver.lib.files_not_in_library.append(full_dest_path)
else: # override is simply copying but not adding a new entry
self.driver.lib.files_not_in_library.append(dest_file)
self.driver.lib.files_not_in_library.append(full_dest_path)

(self.driver.lib.library_dir / dest_file).parent.mkdir(
parents=True, exist_ok=True
)
shutil.copyfile(file, self.driver.lib.library_dir / dest_file)
(full_dest_path).parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(file, full_dest_path)

fileCount += 1
yield [fileCount, duplicated_files_progress]
Expand Down