diff --git a/tests/unit/vertex_rag/test_rag_constants.py b/tests/unit/vertex_rag/test_rag_constants.py
index e85e176006..7bf576c21d 100644
--- a/tests/unit/vertex_rag/test_rag_constants.py
+++ b/tests/unit/vertex_rag/test_rag_constants.py
@@ -24,6 +24,8 @@
     RagCorpus,
     RagFile,
     RagResource,
+    SharePointSource,
+    SharePointSources,
     SlackChannelsSource,
     SlackChannel,
     JiraSource,
@@ -42,6 +44,7 @@
     JiraSource as GapicJiraSource,
     RagCorpus as GapicRagCorpus,
     RagFile as GapicRagFile,
+    SharePointSources as GapicSharePointSources,
     SlackSource as GapicSlackSource,
     RagContexts,
     RetrieveContextsResponse,
@@ -390,6 +393,122 @@
     import_rag_files_config=TEST_IMPORT_FILES_CONFIG_JIRA_SOURCE,
 )
 
+# SharePoint sources
+TEST_SHARE_POINT_SOURCE = SharePointSources(
+    share_point_sources=[
+        SharePointSource(
+            sharepoint_folder_path="test-sharepoint-folder-path",
+            drive_name="test-drive-name",
+            client_id="test-client-id",
+            client_secret="test-client-secret",
+            tenant_id="test-tenant-id",
+            sharepoint_site_name="test-sharepoint-site-name",
+        )
+    ],
+)
+TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE = ImportRagFilesConfig(
+    rag_file_chunking_config=RagFileChunkingConfig(
+        chunk_size=TEST_CHUNK_SIZE,
+        chunk_overlap=TEST_CHUNK_OVERLAP,
+    ),
+    share_point_sources=GapicSharePointSources(
+        share_point_sources=[
+            GapicSharePointSources.SharePointSource(
+                sharepoint_folder_path="test-sharepoint-folder-path",
+                drive_name="test-drive-name",
+                client_id="test-client-id",
+                client_secret=api_auth.ApiAuth.ApiKeyConfig(
+                    api_key_secret_version="test-client-secret"
+                ),
+                tenant_id="test-tenant-id",
+                sharepoint_site_name="test-sharepoint-site-name",
+            )
+        ]
+    ),
+)
+
+TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE = ImportRagFilesRequest(
+    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
+    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
+)
+
+TEST_SHARE_POINT_SOURCE_2_DRIVES = SharePointSources(
+    share_point_sources=[
+        SharePointSource(
+            sharepoint_folder_path="test-sharepoint-folder-path",
+            drive_name="test-drive-name",
+            drive_id="test-drive-id",
+            client_id="test-client-id",
+            client_secret="test-client-secret",
+            tenant_id="test-tenant-id",
+            sharepoint_site_name="test-sharepoint-site-name",
+        )
+    ],
+)
+
+TEST_SHARE_POINT_SOURCE_NO_DRIVES = SharePointSources(
+    share_point_sources=[
+        SharePointSource(
+            sharepoint_folder_path="test-sharepoint-folder-path",
+            client_id="test-client-id",
+            client_secret="test-client-secret",
+            tenant_id="test-tenant-id",
+            sharepoint_site_name="test-sharepoint-site-name",
+        )
+    ],
+)
+
+TEST_SHARE_POINT_SOURCE_2_FOLDERS = SharePointSources(
+    share_point_sources=[
+        SharePointSource(
+            sharepoint_folder_path="test-sharepoint-folder-path",
+            sharepoint_folder_id="test-sharepoint-folder-id",
+            drive_name="test-drive-name",
+            client_id="test-client-id",
+            client_secret="test-client-secret",
+            tenant_id="test-tenant-id",
+            sharepoint_site_name="test-sharepoint-site-name",
+        )
+    ],
+)
+
+TEST_SHARE_POINT_SOURCE_NO_FOLDERS = SharePointSources(
+    share_point_sources=[
+        SharePointSource(
+            drive_name="test-drive-name",
+            client_id="test-client-id",
+            client_secret="test-client-secret",
+            tenant_id="test-tenant-id",
+            sharepoint_site_name="test-sharepoint-site-name",
+        )
+    ],
+)
+
+TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
+    rag_file_chunking_config=RagFileChunkingConfig(
+        chunk_size=TEST_CHUNK_SIZE,
+        chunk_overlap=TEST_CHUNK_OVERLAP,
+    ),
+    share_point_sources=GapicSharePointSources(
+        share_point_sources=[
+            GapicSharePointSources.SharePointSource(
+                drive_name="test-drive-name",
+                client_id="test-client-id",
+                client_secret=api_auth.ApiAuth.ApiKeyConfig(
+                    api_key_secret_version="test-client-secret"
+                ),
+                tenant_id="test-tenant-id",
+                sharepoint_site_name="test-sharepoint-site-name",
+            )
+        ]
+    ),
+)
+
+TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesRequest(
+    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
+    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS,
+)
+
 # Retrieval
 TEST_QUERY_TEXT = "What happen to the fox and the dog?"
 TEST_CONTEXTS = RagContexts(
diff --git a/tests/unit/vertex_rag/test_rag_data.py b/tests/unit/vertex_rag/test_rag_data.py
index c20011c361..72d23662ed 100644
--- a/tests/unit/vertex_rag/test_rag_data.py
+++ b/tests/unit/vertex_rag/test_rag_data.py
@@ -563,6 +563,56 @@ def test_prepare_import_files_request_jira_source(self):
         )
         import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_JIRA_SOURCE)
 
+    def test_prepare_import_files_request_sharepoint_source(self):
+        request = prepare_import_files_request(
+            corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
+            source=tc.TEST_SHARE_POINT_SOURCE,
+            chunk_size=tc.TEST_CHUNK_SIZE,
+            chunk_overlap=tc.TEST_CHUNK_OVERLAP,
+        )
+        import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE)
+
+    def test_prepare_import_files_request_sharepoint_source_2_drives(self):
+        with pytest.raises(ValueError) as e:
+            prepare_import_files_request(
+                corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
+                source=tc.TEST_SHARE_POINT_SOURCE_2_DRIVES,
+                chunk_size=tc.TEST_CHUNK_SIZE,
+                chunk_overlap=tc.TEST_CHUNK_OVERLAP,
+            )
+        e.match("drive_name and drive_id cannot both be set.")
+
+    def test_prepare_import_files_request_sharepoint_source_2_folders(self):
+        with pytest.raises(ValueError) as e:
+            prepare_import_files_request(
+                corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
+                source=tc.TEST_SHARE_POINT_SOURCE_2_FOLDERS,
+                chunk_size=tc.TEST_CHUNK_SIZE,
+                chunk_overlap=tc.TEST_CHUNK_OVERLAP,
+            )
+        e.match("sharepoint_folder_path and sharepoint_folder_id cannot both be set.")
+
+    def test_prepare_import_files_request_sharepoint_source_no_drives(self):
+        with pytest.raises(ValueError) as e:
+            prepare_import_files_request(
+                corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
+                source=tc.TEST_SHARE_POINT_SOURCE_NO_DRIVES,
+                chunk_size=tc.TEST_CHUNK_SIZE,
+                chunk_overlap=tc.TEST_CHUNK_OVERLAP,
+            )
+        e.match("Either drive_name or drive_id must be set.")
+
+    def test_prepare_import_files_request_sharepoint_source_no_folders(self):
+        request = prepare_import_files_request(
+            corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
+            source=tc.TEST_SHARE_POINT_SOURCE_NO_FOLDERS,
+            chunk_size=tc.TEST_CHUNK_SIZE,
+            chunk_overlap=tc.TEST_CHUNK_OVERLAP,
+        )
+        import_files_request_eq(
+            request, tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS
+        )
+
     def test_set_embedding_model_config_set_both_error(self):
         embedding_model_config = rag.EmbeddingModelConfig(
             publisher_model="whatever",
diff --git a/vertexai/preview/rag/__init__.py b/vertexai/preview/rag/__init__.py
index 7c7737276b..2deed4c630 100644
--- a/vertexai/preview/rag/__init__.py
+++ b/vertexai/preview/rag/__init__.py
@@ -44,6 +44,8 @@
     RagCorpus,
     RagFile,
     RagResource,
+    SharePointSource,
+    SharePointSources,
     SlackChannel,
     SlackChannelsSource,
     VertexFeatureStore,
@@ -61,6 +63,8 @@
     "RagFile",
     "RagResource",
     "Retrieval",
+    "SharePointSource",
+    "SharePointSources",
     "SlackChannel",
     "SlackChannelsSource",
     "VertexFeatureStore",
diff --git a/vertexai/preview/rag/rag_data.py b/vertexai/preview/rag/rag_data.py
index e332b0337b..c4b6f96e18 100644
--- a/vertexai/preview/rag/rag_data.py
+++ b/vertexai/preview/rag/rag_data.py
@@ -48,6 +48,7 @@
     Pinecone,
     RagCorpus,
     RagFile,
+    SharePointSources,
     SlackChannelsSource,
     VertexFeatureStore,
     VertexVectorSearch,
@@ -290,7 +291,7 @@ def upload_file(
 def import_files(
     corpus_name: str,
     paths: Optional[Sequence[str]] = None,
-    source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
+    source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
     chunk_size: int = 1024,
     chunk_overlap: int = 200,
     timeout: int = 600,
@@ -354,6 +355,19 @@ def import_files(
             chunk_overlap=100,
         )
 
+        # SharePoint Example.
+        sharepoint_query = rag.SharePointSource(
+            sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
+            sharepoint_site_name="my-sharepoint-site.com",
+            client_id="my-client-id",
+            client_secret="my-client-secret",
+            tenant_id="my-tenant-id",
+            drive_id="my-drive-id",
+        )
+        source = rag.SharePointSources(
+            share_point_sources=[sharepoint_query],
+        )
+
         # Return the number of imported RagFiles after completion.
         print(response.imported_rag_files_count)
 
@@ -420,7 +434,7 @@ def import_files(
 async def import_files_async(
     corpus_name: str,
     paths: Optional[Sequence[str]] = None,
-    source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
+    source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
     chunk_size: int = 1024,
     chunk_overlap: int = 200,
     max_embedding_requests_per_min: int = 1000,
@@ -484,6 +498,19 @@ async def import_files_async(
             chunk_overlap=100,
         )
 
+        # SharePoint Example.
+        sharepoint_query = rag.SharePointSource(
+            sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
+            sharepoint_site_name="my-sharepoint-site.com",
+            client_id="my-client-id",
+            client_secret="my-client-secret",
+            tenant_id="my-tenant-id",
+            drive_id="my-drive-id",
+        )
+        source = rag.SharePointSources(
+            share_point_sources=[sharepoint_query],
+        )
+
         # Get the result.
         await response.result()
 
diff --git a/vertexai/preview/rag/utils/_gapic_utils.py b/vertexai/preview/rag/utils/_gapic_utils.py
index fe513a4ecf..166c3b9107 100644
--- a/vertexai/preview/rag/utils/_gapic_utils.py
+++ b/vertexai/preview/rag/utils/_gapic_utils.py
@@ -26,6 +26,7 @@
     RagFileParsingConfig,
     RagCorpus as GapicRagCorpus,
     RagFile as GapicRagFile,
+    SharePointSources as GapicSharePointSources,
     SlackSource as GapicSlackSource,
     JiraSource as GapicJiraSource,
     RagVectorDbConfig,
@@ -41,6 +42,7 @@
     Pinecone,
     RagCorpus,
     RagFile,
+    SharePointSources,
     SlackChannelsSource,
     JiraSource,
     VertexFeatureStore,
@@ -222,7 +224,7 @@ def convert_path_to_resource_id(
 
 
 def convert_source_for_rag_import(
-    source: Union[SlackChannelsSource, JiraSource]
-) -> Union[GapicSlackSource, GapicJiraSource]:
-    """Converts a SlackChannelsSource or JiraSource to a GapicSlackSource or GapicJiraSource."""
+    source: Union[SlackChannelsSource, JiraSource, SharePointSources]
+) -> Union[GapicSlackSource, GapicJiraSource, GapicSharePointSources]:
+    """Converts a SlackChannelsSource, JiraSource or SharePointSources to its Gapic source type."""
     if isinstance(source, SlackChannelsSource):
@@ -269,14 +271,59 @@ def convert_source_for_rag_import(
         return GapicJiraSource(
             jira_queries=result_source_queries,
         )
+    elif isinstance(source, SharePointSources):
+        result_source_share_point_sources = []
+        for share_point_source in source.share_point_sources:
+            sharepoint_folder_path = share_point_source.sharepoint_folder_path
+            sharepoint_folder_id = share_point_source.sharepoint_folder_id
+            drive_name = share_point_source.drive_name
+            drive_id = share_point_source.drive_id
+            client_id = share_point_source.client_id
+            client_secret = share_point_source.client_secret
+            tenant_id = share_point_source.tenant_id
+            sharepoint_site_name = share_point_source.sharepoint_site_name
+            result_share_point_source = GapicSharePointSources.SharePointSource(
+                client_id=client_id,
+                client_secret=api_auth.ApiAuth.ApiKeyConfig(
+                    api_key_secret_version=client_secret
+                ),
+                tenant_id=tenant_id,
+                sharepoint_site_name=sharepoint_site_name,
+            )
+            # The folder is optional, but its path and ID are mutually exclusive.
+            if sharepoint_folder_path is not None and sharepoint_folder_id is not None:
+                raise ValueError(
+                    "sharepoint_folder_path and sharepoint_folder_id cannot both be set."
+                )
+            elif sharepoint_folder_path is not None:
+                result_share_point_source.sharepoint_folder_path = (
+                    sharepoint_folder_path
+                )
+            elif sharepoint_folder_id is not None:
+                result_share_point_source.sharepoint_folder_id = sharepoint_folder_id
+            # Exactly one of drive_name / drive_id must identify the drive.
+            if drive_name is not None and drive_id is not None:
+                raise ValueError("drive_name and drive_id cannot both be set.")
+            elif drive_name is not None:
+                result_share_point_source.drive_name = drive_name
+            elif drive_id is not None:
+                result_share_point_source.drive_id = drive_id
+            else:
+                raise ValueError("Either drive_name or drive_id must be set.")
+            result_source_share_point_sources.append(result_share_point_source)
+        return GapicSharePointSources(
+            share_point_sources=result_source_share_point_sources,
+        )
     else:
-        raise TypeError("source must be a SlackChannelsSource or JiraSource.")
+        raise TypeError(
+            "source must be a SlackChannelsSource or JiraSource or SharePointSources."
+        )
 
 
 def prepare_import_files_request(
     corpus_name: str,
     paths: Optional[Sequence[str]] = None,
-    source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
+    source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
     chunk_size: int = 1024,
     chunk_overlap: int = 200,
     max_embedding_requests_per_min: int = 1000,
@@ -307,6 +354,8 @@ def prepare_import_files_request(
             import_rag_files_config.slack_source = gapic_source
         if isinstance(gapic_source, GapicJiraSource):
             import_rag_files_config.jira_source = gapic_source
+        if isinstance(gapic_source, GapicSharePointSources):
+            import_rag_files_config.share_point_sources = gapic_source
     else:
         uris = []
         resource_ids = []
diff --git a/vertexai/preview/rag/utils/resources.py b/vertexai/preview/rag/utils/resources.py
index 371ccc3e9e..6f86f0a8ad 100644
--- a/vertexai/preview/rag/utils/resources.py
+++ b/vertexai/preview/rag/utils/resources.py
@@ -232,3 +232,48 @@ class JiraSource:
     """
 
     queries: Sequence[JiraQuery]
+
+
+@dataclasses.dataclass
+class SharePointSource:
+    """SharePointSource.
+
+    Attributes:
+        sharepoint_folder_path: The path of the SharePoint folder to download
+            from.
+        sharepoint_folder_id: The ID of the SharePoint folder to download
+            from.
+        drive_name: The name of the drive to download from.
+        drive_id: The ID of the drive to download from.
+        client_id: The Application ID for the app registered in
+            Microsoft Azure Portal. The application must
+            also be configured with MS Graph permissions
+            "Files.Read.All", "Sites.Read.All" and
+            "BrowserSiteLists.Read.All".
+        client_secret: The application secret for the app registered
+            in Azure.
+        tenant_id: Unique identifier of the Azure Active
+            Directory Instance.
+        sharepoint_site_name: The name of the SharePoint site to download
+            from. This can be the site name or the site id.
+    """
+
+    sharepoint_folder_path: Optional[str] = None
+    sharepoint_folder_id: Optional[str] = None
+    drive_name: Optional[str] = None
+    drive_id: Optional[str] = None
+    client_id: str = None
+    client_secret: str = None
+    tenant_id: str = None
+    sharepoint_site_name: str = None
+
+
+@dataclasses.dataclass
+class SharePointSources:
+    """SharePointSources.
+
+    Attributes:
+        share_point_sources: The SharePoint sources.
+    """
+
+    share_point_sources: Sequence[SharePointSource]