Skip to content

Commit

Permalink
Improve egnyte connector (#3626)
Browse files: browse the repository at this point in the history
  • Loading branch information
Weves authored Jan 8, 2025
1 parent 7175608 commit eac73a1
Showing 1 changed file with 25 additions and 27 deletions.
52 changes: 25 additions & 27 deletions backend/onyx/connectors/egnyte/connector.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -224,7 +224,7 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None
def _get_files_list(
self,
path: str,
) -> list[dict[str, Any]]:
) -> Generator[dict[str, Any], None, None]:
if not self.access_token or not self.domain:
raise ConnectorMissingCredentialError("Egnyte")

Expand All @@ -245,48 +245,46 @@ def _get_files_list(
raise RuntimeError(f"Failed to fetch files from Egnyte: {response.text}")

data = response.json()
all_files: list[dict[str, Any]] = []

# Add files from current directory
all_files.extend(data.get("files", []))
# Yield files from current directory
for file in data.get("files", []):
yield file

# Recursively traverse folders
for item in data.get("folders", []):
all_files.extend(self._get_files_list(item["path"]))
for folder in data.get("folders", []):
yield from self._get_files_list(folder["path"])

return all_files

def _filter_files(
def _should_index_file(
self,
files: list[dict[str, Any]],
file: dict[str, Any],
start_time: datetime | None = None,
end_time: datetime | None = None,
) -> list[dict[str, Any]]:
filtered_files = []
for file in files:
if file["is_folder"]:
continue

file_modified = _parse_last_modified(file["last_modified"])
if start_time and file_modified < start_time:
continue
if end_time and file_modified > end_time:
continue
) -> bool:
"""Return True if file should be included based on filters."""
if file["is_folder"]:
return False

filtered_files.append(file)
file_modified = _parse_last_modified(file["last_modified"])
if start_time and file_modified < start_time:
return False
if end_time and file_modified > end_time:
return False

return filtered_files
return True

def _process_files(
self,
start_time: datetime | None = None,
end_time: datetime | None = None,
) -> Generator[list[Document], None, None]:
files = self._get_files_list(self.folder_path)
files = self._filter_files(files, start_time, end_time)

current_batch: list[Document] = []
for file in files:

# Iterate through yielded files and filter them
for file in self._get_files_list(self.folder_path):
if not self._should_index_file(file, start_time, end_time):
logger.debug(f"Skipping file '{file['path']}'.")
continue

try:
# Set up request with streaming enabled
headers = {
Expand Down

0 comments on commit eac73a1

Please sign in to comment.