Skip to content

Commit

Permalink
feat: add support for SharePoint as an ImportRagFiles source.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 677936135
  • Loading branch information
vertex-sdk-bot authored and copybara-github committed Sep 23, 2024
1 parent b456ce3 commit f89df1f
Show file tree
Hide file tree
Showing 6 changed files with 297 additions and 5 deletions.
119 changes: 119 additions & 0 deletions tests/unit/vertex_rag/test_rag_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
RagCorpus,
RagFile,
RagResource,
SharePointSource,
SharePointSources,
SlackChannelsSource,
SlackChannel,
JiraSource,
Expand All @@ -42,6 +44,7 @@
JiraSource as GapicJiraSource,
RagCorpus as GapicRagCorpus,
RagFile as GapicRagFile,
SharePointSources as GapicSharePointSources,
SlackSource as GapicSlackSource,
RagContexts,
RetrieveContextsResponse,
Expand Down Expand Up @@ -390,6 +393,122 @@
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_JIRA_SOURCE,
)

# SharePoint sources
# SDK-level input: a single SharePoint source addressed by folder path and
# drive name.
TEST_SHARE_POINT_SOURCE = SharePointSources(
    share_point_sources=[
        SharePointSource(
            sharepoint_site_name="test-sharepoint-site-name",
            sharepoint_folder_path="test-sharepoint-folder-path",
            drive_name="test-drive-name",
            tenant_id="test-tenant-id",
            client_id="test-client-id",
            client_secret="test-client-secret",
        )
    ],
)
# GAPIC-level config carrying the same field values as TEST_SHARE_POINT_SOURCE;
# here the client secret is wrapped in an ApiKeyConfig instead of being a
# plain string.
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE = ImportRagFilesConfig(
    share_point_sources=GapicSharePointSources(
        share_point_sources=[
            GapicSharePointSources.SharePointSource(
                sharepoint_site_name="test-sharepoint-site-name",
                sharepoint_folder_path="test-sharepoint-folder-path",
                drive_name="test-drive-name",
                tenant_id="test-tenant-id",
                client_id="test-client-id",
                client_secret=api_auth.ApiAuth.ApiKeyConfig(
                    api_key_secret_version="test-client-secret"
                ),
            )
        ]
    ),
    rag_file_chunking_config=RagFileChunkingConfig(
        chunk_overlap=TEST_CHUNK_OVERLAP,
        chunk_size=TEST_CHUNK_SIZE,
    ),
)

# Import request that wraps the config above for the test corpus.
TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE = ImportRagFilesRequest(
    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
)

# Fixture with BOTH drive_name and drive_id set on the same source.
TEST_SHARE_POINT_SOURCE_2_DRIVES = SharePointSources(
    share_point_sources=[
        SharePointSource(
            sharepoint_site_name="test-sharepoint-site-name",
            sharepoint_folder_path="test-sharepoint-folder-path",
            drive_name="test-drive-name",
            drive_id="test-drive-id",
            tenant_id="test-tenant-id",
            client_id="test-client-id",
            client_secret="test-client-secret",
        )
    ],
)

# Fixture with NEITHER drive_name nor drive_id set.
TEST_SHARE_POINT_SOURCE_NO_DRIVES = SharePointSources(
    share_point_sources=[
        SharePointSource(
            sharepoint_site_name="test-sharepoint-site-name",
            sharepoint_folder_path="test-sharepoint-folder-path",
            tenant_id="test-tenant-id",
            client_id="test-client-id",
            client_secret="test-client-secret",
        )
    ],
)

# Fixture with BOTH sharepoint_folder_path and sharepoint_folder_id set.
TEST_SHARE_POINT_SOURCE_2_FOLDERS = SharePointSources(
    share_point_sources=[
        SharePointSource(
            sharepoint_site_name="test-sharepoint-site-name",
            sharepoint_folder_path="test-sharepoint-folder-path",
            sharepoint_folder_id="test-sharepoint-folder-id",
            drive_name="test-drive-name",
            tenant_id="test-tenant-id",
            client_id="test-client-id",
            client_secret="test-client-secret",
        )
    ],
)

# Fixture with NEITHER folder field set; only the drive name identifies the
# content to import.
TEST_SHARE_POINT_SOURCE_NO_FOLDERS = SharePointSources(
    share_point_sources=[
        SharePointSource(
            sharepoint_site_name="test-sharepoint-site-name",
            drive_name="test-drive-name",
            tenant_id="test-tenant-id",
            client_id="test-client-id",
            client_secret="test-client-secret",
        )
    ],
)

# GAPIC-level config for a SharePoint source that sets no folder path or
# folder id; only the drive name is populated alongside the credentials.
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
    rag_file_chunking_config=RagFileChunkingConfig(
        chunk_size=TEST_CHUNK_SIZE,
        chunk_overlap=TEST_CHUNK_OVERLAP,
    ),
    share_point_sources=GapicSharePointSources(
        share_point_sources=[
            GapicSharePointSources.SharePointSource(
                drive_name="test-drive-name",
                client_id="test-client-id",
                client_secret=api_auth.ApiAuth.ApiKeyConfig(
                    api_key_secret_version="test-client-secret"
                ),
                tenant_id="test-tenant-id",
                sharepoint_site_name="test-sharepoint-site-name",
            )
        ]
    ),
)

# The request must wrap the NO_FOLDERS config defined directly above. It
# previously wrapped TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE, which carries
# a sharepoint_folder_path, so the expected value did not describe the
# no-folder scenario and the NO_FOLDERS config was never used.
TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesRequest(
    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS,
)

# Retrieval
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
TEST_CONTEXTS = RagContexts(
Expand Down
50 changes: 50 additions & 0 deletions tests/unit/vertex_rag/test_rag_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,56 @@ def test_prepare_import_files_request_jira_source(self):
)
import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_JIRA_SOURCE)

def test_prepare_import_files_request_sharepoint_source(self):
    """Build a request from a SharePoint source and compare it to the expected one."""
    actual_request = prepare_import_files_request(
        source=tc.TEST_SHARE_POINT_SOURCE,
        corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
        chunk_overlap=tc.TEST_CHUNK_OVERLAP,
        chunk_size=tc.TEST_CHUNK_SIZE,
    )
    expected_request = tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE
    import_files_request_eq(actual_request, expected_request)

def test_prepare_import_files_request_sharepoint_source_2_drives(self):
    """A source that sets both drive_name and drive_id raises ValueError."""
    # `match=` applies the same regex search to the exception text as calling
    # `.match()` on the captured exception info afterwards.
    with pytest.raises(
        ValueError, match="drive_name and drive_id cannot both be set."
    ):
        prepare_import_files_request(
            source=tc.TEST_SHARE_POINT_SOURCE_2_DRIVES,
            corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
            chunk_overlap=tc.TEST_CHUNK_OVERLAP,
            chunk_size=tc.TEST_CHUNK_SIZE,
        )

def test_prepare_import_files_request_sharepoint_source_2_folders(self):
    """A source that sets both folder path and folder id raises ValueError."""
    # `match=` applies the same regex search to the exception text as calling
    # `.match()` on the captured exception info afterwards.
    with pytest.raises(
        ValueError,
        match="sharepoint_folder_path and sharepoint_folder_id cannot both be set.",
    ):
        prepare_import_files_request(
            source=tc.TEST_SHARE_POINT_SOURCE_2_FOLDERS,
            corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
            chunk_overlap=tc.TEST_CHUNK_OVERLAP,
            chunk_size=tc.TEST_CHUNK_SIZE,
        )

def test_prepare_import_files_request_sharepoint_source_no_drives(self):
    """A source that sets neither drive_name nor drive_id raises ValueError."""
    # The pattern is kept verbatim so it keeps matching the message currently
    # raised by the conversion code.
    with pytest.raises(
        ValueError, match="Either drive_name and drive_id must be set."
    ):
        prepare_import_files_request(
            source=tc.TEST_SHARE_POINT_SOURCE_NO_DRIVES,
            corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
            chunk_overlap=tc.TEST_CHUNK_OVERLAP,
            chunk_size=tc.TEST_CHUNK_SIZE,
        )

def test_prepare_import_files_request_sharepoint_source_no_folders(self):
    """A source with no folder path or folder id still yields an import request."""
    actual_request = prepare_import_files_request(
        source=tc.TEST_SHARE_POINT_SOURCE_NO_FOLDERS,
        corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
        chunk_overlap=tc.TEST_CHUNK_OVERLAP,
        chunk_size=tc.TEST_CHUNK_SIZE,
    )
    expected_request = tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS
    import_files_request_eq(actual_request, expected_request)

def test_set_embedding_model_config_set_both_error(self):
embedding_model_config = rag.EmbeddingModelConfig(
publisher_model="whatever",
Expand Down
4 changes: 4 additions & 0 deletions vertexai/preview/rag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
RagCorpus,
RagFile,
RagResource,
SharePointSource,
SharePointSources,
SlackChannel,
SlackChannelsSource,
VertexFeatureStore,
Expand All @@ -61,6 +63,8 @@
"RagFile",
"RagResource",
"Retrieval",
"SharePointSource",
"SharePointSources",
"SlackChannel",
"SlackChannelsSource",
"VertexFeatureStore",
Expand Down
31 changes: 29 additions & 2 deletions vertexai/preview/rag/rag_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
Pinecone,
RagCorpus,
RagFile,
SharePointSources,
SlackChannelsSource,
VertexFeatureStore,
VertexVectorSearch,
Expand Down Expand Up @@ -290,7 +291,7 @@ def upload_file(
def import_files(
corpus_name: str,
paths: Optional[Sequence[str]] = None,
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
chunk_size: int = 1024,
chunk_overlap: int = 200,
timeout: int = 600,
Expand Down Expand Up @@ -354,6 +355,19 @@ def import_files(
chunk_overlap=100,
)
# SharePoint Example.
sharepoint_query = rag.SharePointSource(
sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
sharepoint_site_name="my-sharepoint-site.com",
client_id="my-client-id",
client_secret="my-client-secret",
tenant_id="my-tenant-id",
drive_id="my-drive-id",
)
source = rag.SharePointSources(
share_point_sources=[sharepoint_query],
)
# Return the number of imported RagFiles after completion.
print(response.imported_rag_files_count)
Expand Down Expand Up @@ -420,7 +434,7 @@ def import_files(
async def import_files_async(
corpus_name: str,
paths: Optional[Sequence[str]] = None,
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
chunk_size: int = 1024,
chunk_overlap: int = 200,
max_embedding_requests_per_min: int = 1000,
Expand Down Expand Up @@ -484,6 +498,19 @@ async def import_files_async(
chunk_overlap=100,
)
# SharePoint Example.
sharepoint_query = rag.SharePointSource(
sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
sharepoint_site_name="my-sharepoint-site.com",
client_id="my-client-id",
client_secret="my-client-secret",
tenant_id="my-tenant-id",
drive_id="my-drive-id",
)
source = rag.SharePointSources(
share_point_sources=[sharepoint_query],
)
# Get the result.
await response.result()
Expand Down
53 changes: 50 additions & 3 deletions vertexai/preview/rag/utils/_gapic_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
RagFileParsingConfig,
RagCorpus as GapicRagCorpus,
RagFile as GapicRagFile,
SharePointSources as GapicSharePointSources,
SlackSource as GapicSlackSource,
JiraSource as GapicJiraSource,
RagVectorDbConfig,
Expand All @@ -41,6 +42,7 @@
Pinecone,
RagCorpus,
RagFile,
SharePointSources,
SlackChannelsSource,
JiraSource,
VertexFeatureStore,
Expand Down Expand Up @@ -222,7 +224,7 @@ def convert_path_to_resource_id(


def convert_source_for_rag_import(
source: Union[SlackChannelsSource, JiraSource]
source: Union[SlackChannelsSource, JiraSource, SharePointSources]
) -> Union[GapicSlackSource, GapicJiraSource]:
"""Converts a SlackChannelsSource or JiraSource to a GapicSlackSource or GapicJiraSource."""
if isinstance(source, SlackChannelsSource):
Expand Down Expand Up @@ -269,14 +271,57 @@ def convert_source_for_rag_import(
return GapicJiraSource(
jira_queries=result_source_queries,
)
elif isinstance(source, SharePointSources):
result_source_share_point_sources = []
for share_point_source in source.share_point_sources:
sharepoint_folder_path = share_point_source.sharepoint_folder_path
sharepoint_folder_id = share_point_source.sharepoint_folder_id
drive_name = share_point_source.drive_name
drive_id = share_point_source.drive_id
client_id = share_point_source.client_id
client_secret = share_point_source.client_secret
tenant_id = share_point_source.tenant_id
sharepoint_site_name = share_point_source.sharepoint_site_name
result_share_point_source = GapicSharePointSources.SharePointSource(
client_id=client_id,
client_secret=api_auth.ApiAuth.ApiKeyConfig(
api_key_secret_version=client_secret
),
tenant_id=tenant_id,
sharepoint_site_name=sharepoint_site_name,
)
if sharepoint_folder_path is not None and sharepoint_folder_id is not None:
raise ValueError(
"sharepoint_folder_path and sharepoint_folder_id cannot both be set."
)
elif sharepoint_folder_path is not None:
result_share_point_source.sharepoint_folder_path = (
sharepoint_folder_path
)
elif sharepoint_folder_id is not None:
result_share_point_source.sharepoint_folder_id = sharepoint_folder_id
if drive_name is not None and drive_id is not None:
raise ValueError("drive_name and drive_id cannot both be set.")
elif drive_name is not None:
result_share_point_source.drive_name = drive_name
elif drive_id is not None:
result_share_point_source.drive_id = drive_id
else:
raise ValueError("Either drive_name and drive_id must be set.")
result_source_share_point_sources.append(result_share_point_source)
return GapicSharePointSources(
share_point_sources=result_source_share_point_sources,
)
else:
raise TypeError("source must be a SlackChannelsSource or JiraSource.")
raise TypeError(
"source must be a SlackChannelsSource or JiraSource or SharePointSources."
)


def prepare_import_files_request(
corpus_name: str,
paths: Optional[Sequence[str]] = None,
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
chunk_size: int = 1024,
chunk_overlap: int = 200,
max_embedding_requests_per_min: int = 1000,
Expand Down Expand Up @@ -307,6 +352,8 @@ def prepare_import_files_request(
import_rag_files_config.slack_source = gapic_source
if isinstance(gapic_source, GapicJiraSource):
import_rag_files_config.jira_source = gapic_source
if isinstance(gapic_source, GapicSharePointSources):
import_rag_files_config.share_point_sources = gapic_source
else:
uris = []
resource_ids = []
Expand Down
Loading

0 comments on commit f89df1f

Please sign in to comment.