From 5e5448f12527ed4b164b797b5250759fe4e0b603 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 11:54:23 -0800 Subject: [PATCH 01/11] v1 --- .../celery/tasks/shared/RetryDocumentIndex.py | 32 ++- .../background/celery/tasks/shared/tasks.py | 17 +- .../background/celery/tasks/vespa/tasks.py | 9 +- backend/onyx/document_index/interfaces.py | 17 +- backend/onyx/document_index/vespa/index.py | 272 +++++++++--------- .../scripts/force_delete_connector_by_id.py | 15 +- backend/scripts/orphan_doc_cleanup_script.py | 10 +- 7 files changed, 212 insertions(+), 160 deletions(-) diff --git a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py index 62a34196f60..7ed448d1b1d 100644 --- a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py +++ b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py @@ -28,13 +28,37 @@ def __init__(self, index: DocumentIndex): wait=wait_random_exponential(multiplier=1, max=MAX_WAIT), stop=stop_after_delay(STOP_AFTER), ) - def delete_single(self, doc_id: str) -> int: - return self.index.delete_single(doc_id) + def delete_single( + self, + doc_id: str, + large_chunks_enabled: bool, + tenant_id: str | None, + chunk_count: int | None, + ) -> int: + return self.index.delete_single( + doc_id, + large_chunks_enabled=large_chunks_enabled, + tenant_id=tenant_id, + chunk_count=chunk_count, + ) @retry( retry=retry_if_exception_type(httpx.ReadTimeout), wait=wait_random_exponential(multiplier=1, max=MAX_WAIT), stop=stop_after_delay(STOP_AFTER), ) - def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: - return self.index.update_single(doc_id, fields) + def update_single( + self, + doc_id: str, + large_chunks_enabled: bool, + tenant_id: str | None, + chunk_count: int | None, + fields: VespaDocumentFields, + ) -> int: + return self.index.update_single( + doc_id, + large_chunks_enabled=large_chunks_enabled, + tenant_id=tenant_id, + chunk_count=chunk_count, + fields=fields, + ) diff --git a/backend/onyx/background/celery/tasks/shared/tasks.py b/backend/onyx/background/celery/tasks/shared/tasks.py index 03a7ab83701..e30c3375697 100644 --- a/backend/onyx/background/celery/tasks/shared/tasks.py +++ b/backend/onyx/background/celery/tasks/shared/tasks.py @@ -80,7 +80,13 @@ def document_by_cc_pair_cleanup_task( # delete it from vespa and the db action = "delete" - chunks_affected = retry_index.delete_single(document_id) + # TODO: fix the large chunks enabled + chunks_affected = retry_index.delete_single( + document_id, + large_chunks_enabled=False, + tenant_id=tenant_id, + chunk_count=None, + ) delete_documents_complete__no_commit( db_session=db_session, document_ids=[document_id], @@ -110,7 +116,14 @@ def document_by_cc_pair_cleanup_task( ) # update Vespa. OK if doc doesn't exist. Raises exception otherwise. - chunks_affected = retry_index.update_single(document_id, fields=fields) + # TODO: fix the large chunks enabled + chunks_affected = retry_index.update_single( + document_id, + large_chunks_enabled=False, + tenant_id=tenant_id, + chunk_count=doc.chunk_count, + fields=fields, + ) # there are still other cc_pair references to the doc, so just resync to Vespa delete_document_by_connector_credential_pair__no_commit( diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py index c00bac354f3..591d55ec91b 100644 --- a/backend/onyx/background/celery/tasks/vespa/tasks.py +++ b/backend/onyx/background/celery/tasks/vespa/tasks.py @@ -992,7 +992,14 @@ def vespa_metadata_sync_task( ) # update Vespa. OK if doc doesn't exist. Raises exception otherwise. - chunks_affected = retry_index.update_single(document_id, fields) + # TODO: fix hte large chunks enabled + chunks_affected = retry_index.update_single( + document_id, + large_chunks_enabled=False, + tenant_id=tenant_id, + chunk_count=doc.chunk_count, + fields=fields, + ) # update db last. Worst case = we crash right before this and # the sync might repeat again later diff --git a/backend/onyx/document_index/interfaces.py b/backend/onyx/document_index/interfaces.py index c97c87fd07e..d54f317247e 100644 --- a/backend/onyx/document_index/interfaces.py +++ b/backend/onyx/document_index/interfaces.py @@ -218,7 +218,13 @@ class Deletable(abc.ABC): """ @abc.abstractmethod - def delete_single(self, doc_id: str) -> int: + def delete_single( + self, + doc_id: str, + large_chunks_enabled: bool, + tenant_id: str | None, + chunk_count: int | None, + ) -> int: """ Given a single document id, hard delete it from the document index @@ -239,7 +245,14 @@ class Updatable(abc.ABC): """ @abc.abstractmethod - def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: + def update_single( + self, + doc_id: str, + large_chunks_enabled: bool, + chunk_count: int | None, + tenant_id: str | None, + fields: VespaDocumentFields, + ) -> int: """ Updates all chunks for a document with the specified fields. None values mean that the field does not need an update. diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 80e5504fa8e..54cd608f6ef 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -13,11 +13,11 @@ from typing import BinaryIO from typing import cast from typing import List +from uuid import UUID import httpx # type: ignore import requests # type: ignore -from onyx.configs.app_configs import DOCUMENT_INDEX_NAME from onyx.configs.chat_configs import DOC_TIME_DECAY from onyx.configs.chat_configs import NUM_RETURNED_HITS from onyx.configs.chat_configs import TITLE_CONTENT_RATIO @@ -336,37 +336,21 @@ def index( # know precisely which chunks to delete. This information exists for # documents that have `chunk_count` in the database, but not for # `old_version` documents. + enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = [ + VespaIndex.enrich_basic_chunk_info( + index_name=self.index_name, + http_client=http_client, + document_id=doc_id, + previous_chunk_count=doc_id_to_previous_chunk_cnt.get(doc_id), + new_chunk_count=doc_id_to_new_chunk_cnt.get(doc_id, 0), + ) + for doc_id in doc_id_to_new_chunk_cnt.keys() + ] - enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = [] - for document_id, doc_count in doc_id_to_previous_chunk_cnt.items(): - last_indexed_chunk = doc_id_to_previous_chunk_cnt.get(document_id, None) - # If the document has no `chunk_count` in the database, we know that it - # has the old chunk ID system and we must check for the final chunk index - is_old_version = False - if last_indexed_chunk is None: - is_old_version = True - minimal_doc_info = MinimalDocumentIndexingInfo( - doc_id=document_id, - chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0), - ) - last_indexed_chunk = check_for_final_chunk_existence( - minimal_doc_info=minimal_doc_info, - start_index=doc_id_to_new_chunk_cnt[document_id], - index_name=self.index_name, - http_client=http_client, - ) - + for cleaned_doc_info in enriched_doc_infos: # If the document has previously indexed chunks, we know it previously existed - if doc_count or last_indexed_chunk: - existing_docs.add(document_id) - - enriched_doc_info = EnrichedDocumentIndexingInfo( - doc_id=document_id, - chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0), - chunk_end_index=last_indexed_chunk, - old_version=is_old_version, - ) - enriched_doc_infos.append(enriched_doc_info) + if cleaned_doc_info.chunk_end_index: + existing_docs.add(cleaned_doc_info.doc_id) # Now, for each doc, we know exactly where to start and end our deletion # So let's generate the chunk IDs for each chunk to delete @@ -537,19 +521,12 @@ def update(self, update_requests: list[UpdateRequest]) -> None: time.monotonic() - update_start, ) - def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: - """Note: if the document id does not exist, the update will be a no-op and the - function will complete with no errors or exceptions. - Handle other exceptions if you wish to implement retry behavior - """ - - total_chunks_updated = 0 - - # Handle Vespa character limitations - # Mutating update_request but it's not used later anyway - normalized_doc_id = replace_invalid_doc_id_characters(doc_id) + def update_single_chunk( + self, doc_chunk_id: UUID, index_name: str, fields: VespaDocumentFields + ) -> None: + """Update a single chunk in Vespa using its chunk ID.""" - # Build the _VespaUpdateRequest objects + # Build the update request update_dict: dict[str, dict] = {"fields": {}} if fields.boost is not None: update_dict["fields"][BOOST] = {"assign": fields.boost} @@ -566,7 +543,39 @@ def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: if not update_dict["fields"]: logger.error("Update request received but nothing to update") - return 0 + return + + with get_vespa_http_client(http2=False) as http_client: + vespa_url = ( + f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}" + ) + + try: + logger.debug(f'update_single PUT on URL "{vespa_url}"') + resp = http_client.put( + vespa_url, + headers={"Content-Type": "application/json"}, + json=update_dict, + ) + resp.raise_for_status() + + except httpx.HTTPStatusError as e: + logger.error(f"Failed to update chunk, details: {e.response.text}") + raise + + def update_single( + self, + doc_id: str, + large_chunks_enabled: bool, + chunk_count: int | None, + tenant_id: str | None, + fields: VespaDocumentFields, + ) -> int: + """Note: if the document id does not exist, the update will be a no-op and the + function will complete with no errors or exceptions. + Handle other exceptions if you wish to implement retry behavior + """ + doc_chunk_count = 0 index_names = [self.index_name] if self.secondary_index_name: @@ -574,66 +583,38 @@ def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: with get_vespa_http_client(http2=False) as http_client: for index_name in index_names: - params = httpx.QueryParams( - { - "selection": f"{index_name}.document_id=='{normalized_doc_id}'", - "cluster": DOCUMENT_INDEX_NAME, - } + enriched_doc_infos = VespaIndex.enrich_basic_chunk_info( + index_name=index_name, + http_client=http_client, + document_id=doc_id, + previous_chunk_count=chunk_count, + new_chunk_count=0, ) - - while True: - try: - vespa_url = ( - f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}" - ) - logger.debug(f'update_single PUT on URL "{vespa_url}"') - resp = http_client.put( - vespa_url, - params=params, - headers={"Content-Type": "application/json"}, - json=update_dict, - ) - - resp.raise_for_status() - except httpx.HTTPStatusError as e: - logger.error( - f"Failed to update chunks, details: {e.response.text}" - ) - raise - - resp_data = resp.json() - - if "documentCount" in resp_data: - chunks_updated = resp_data["documentCount"] - total_chunks_updated += chunks_updated - - # Check for continuation token to handle pagination - if "continuation" not in resp_data: - break # Exit loop if no continuation token - - if not resp_data["continuation"]: - break # Exit loop if continuation token is empty - - params = params.set("continuation", resp_data["continuation"]) - - logger.debug( - f"VespaIndex.update_single: " - f"index={index_name} " - f"doc={normalized_doc_id} " - f"chunks_updated={total_chunks_updated}" + doc_chunk_ids = assemble_document_chunk_info( + enriched_document_info_list=[enriched_doc_infos], + tenant_id=tenant_id, + large_chunks_enabled=large_chunks_enabled, ) + doc_chunk_count += len(doc_chunk_ids) - return total_chunks_updated + for doc_chunk_id in doc_chunk_ids: + self.update_single_chunk( + doc_chunk_id=doc_chunk_id, index_name=index_name, fields=fields + ) + return doc_chunk_count - def delete_single(self, doc_id: str) -> int: + def delete_single( + self, + doc_id: str, + large_chunks_enabled: bool, + tenant_id: str | None, + chunk_count: int | None, + ) -> int: """Possibly faster overall than the delete method due to using a single delete call with a selection query.""" total_chunks_deleted = 0 - # Vespa deletion is poorly documented ... luckily we found this - # https://docs.vespa.ai/en/operations/batch-delete.html#example - doc_id = replace_invalid_doc_id_characters(doc_id) # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for @@ -642,53 +623,34 @@ def delete_single(self, doc_id: str) -> int: if self.secondary_index_name: index_names.append(self.secondary_index_name) - with get_vespa_http_client(http2=False) as http_client: + with get_vespa_http_client( + http2=False + ) as http_client, concurrent.futures.ThreadPoolExecutor( + max_workers=NUM_THREADS + ) as executor: for index_name in index_names: - params = httpx.QueryParams( - { - "selection": f"{index_name}.document_id=='{doc_id}'", - "cluster": DOCUMENT_INDEX_NAME, - } + enriched_doc_infos = VespaIndex.enrich_basic_chunk_info( + index_name=index_name, + http_client=http_client, + document_id=doc_id, + previous_chunk_count=chunk_count, + new_chunk_count=0, ) - - while True: - try: - vespa_url = ( - f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}" - ) - logger.debug(f'delete_single DELETE on URL "{vespa_url}"') - resp = http_client.delete( - vespa_url, - params=params, - ) - resp.raise_for_status() - except httpx.HTTPStatusError as e: - logger.error( - f"Failed to delete chunk, details: {e.response.text}" - ) - raise - - resp_data = resp.json() - - if "documentCount" in resp_data: - chunks_deleted = resp_data["documentCount"] - total_chunks_deleted += chunks_deleted - - # Check for continuation token to handle pagination - if "continuation" not in resp_data: - break # Exit loop if no continuation token - - if not resp_data["continuation"]: - break # Exit loop if continuation token is empty - - params = params.set("continuation", resp_data["continuation"]) - - logger.debug( - f"VespaIndex.delete_single: " - f"index={index_name} " - f"doc={doc_id} " - f"chunks_deleted={total_chunks_deleted}" + chunks_to_delete = assemble_document_chunk_info( + enriched_document_info_list=[enriched_doc_infos], + tenant_id=tenant_id, + large_chunks_enabled=large_chunks_enabled, ) + for doc_chunk_ids_batch in batch_generator( + chunks_to_delete, BATCH_SIZE + ): + total_chunks_deleted += len(doc_chunk_ids_batch) + delete_vespa_chunks( + doc_chunk_ids=doc_chunk_ids_batch, + index_name=index_name, + http_client=http_client, + executor=executor, + ) return total_chunks_deleted @@ -787,6 +749,40 @@ def admin_retrieval( return query_vespa(params) + @classmethod + def enrich_basic_chunk_info( + cls, + index_name: str, + http_client: httpx.Client, + document_id: str, + previous_chunk_count: int | None = None, + new_chunk_count: int = 0, + ) -> EnrichedDocumentIndexingInfo: + last_indexed_chunk = previous_chunk_count + + # If the document has no `chunk_count` in the database, we know that it + # has the old chunk ID system and we must check for the final chunk index + is_old_version = False + if last_indexed_chunk is None: + is_old_version = True + minimal_doc_info = MinimalDocumentIndexingInfo( + doc_id=document_id, chunk_start_index=new_chunk_count + ) + last_indexed_chunk = check_for_final_chunk_existence( + minimal_doc_info=minimal_doc_info, + start_index=new_chunk_count, + index_name=index_name, + http_client=http_client, + ) + + enriched_doc_info = EnrichedDocumentIndexingInfo( + doc_id=document_id, + chunk_start_index=new_chunk_count, + chunk_end_index=last_indexed_chunk, + old_version=is_old_version, + ) + return enriched_doc_info + @classmethod def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None: """ diff --git a/backend/scripts/force_delete_connector_by_id.py b/backend/scripts/force_delete_connector_by_id.py index bc994238870..3f0bf01ad61 100755 --- a/backend/scripts/force_delete_connector_by_id.py +++ b/backend/scripts/force_delete_connector_by_id.py @@ -38,7 +38,6 @@ from onyx.document_index.factory import get_default_document_index from onyx.file_store.file_store import get_default_file_store from onyx.document_index.document_index_utils import get_both_index_names -from onyx.db.document import delete_documents_complete__no_commit # pylint: enable=E402 # flake8: noqa: E402 @@ -71,14 +70,14 @@ def _unsafe_deletion( if not documents: break - document_ids = [document.id for document in documents] - for doc_id in document_ids: - document_index.delete_single(doc_id) + # document_ids = [document.id for document in documents] + # for doc_id in document_ids: + # document_index.delete_single(doc_id) - delete_documents_complete__no_commit( - db_session=db_session, - document_ids=document_ids, - ) + # delete_documents_complete__no_commit( + # db_session=db_session, + # document_ids=document_ids, + # ) num_docs_deleted += len(documents) diff --git a/backend/scripts/orphan_doc_cleanup_script.py b/backend/scripts/orphan_doc_cleanup_script.py index ea776e770eb..6d404f7f52e 100644 --- a/backend/scripts/orphan_doc_cleanup_script.py +++ b/backend/scripts/orphan_doc_cleanup_script.py @@ -83,11 +83,11 @@ def process_doc(doc_id: str) -> str | None: try: print(f"Deleting document {doc_id} in Vespa") - chunks_deleted = vespa_index.delete_single(doc_id) - if chunks_deleted > 0: - print( - f"Deleted {chunks_deleted} chunks for document {doc_id}" - ) + # chunks_deleted = vespa_index.delete_single(doc_id) + # if chunks_deleted > 0: + # print( + # f"Deleted {chunks_deleted} chunks for document {doc_id}" + # ) return doc_id except Exception as e: print( From c35ccd2cf16439380995197628db1cd61ea9f389 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 12:43:31 -0800 Subject: [PATCH 02/11] update indexing logic --- .../celery/tasks/shared/RetryDocumentIndex.py | 3 - .../background/celery/tasks/shared/tasks.py | 1 - backend/onyx/document_index/interfaces.py | 8 +- backend/onyx/document_index/vespa/index.py | 87 +++++++++++-------- .../document_index/vespa/indexing_utils.py | 53 ++++++++++- backend/onyx/indexing/indexing_pipeline.py | 36 ++------ backend/onyx/indexing/models.py | 5 ++ 7 files changed, 117 insertions(+), 76 deletions(-) diff --git a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py index 7ed448d1b1d..1f0cd000402 100644 --- a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py +++ b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py @@ -31,13 +31,11 @@ def __init__(self, index: DocumentIndex): def delete_single( self, doc_id: str, - large_chunks_enabled: bool, tenant_id: str | None, chunk_count: int | None, ) -> int: return self.index.delete_single( doc_id, - large_chunks_enabled=large_chunks_enabled, tenant_id=tenant_id, chunk_count=chunk_count, ) @@ -57,7 +55,6 @@ def update_single( ) -> int: return self.index.update_single( doc_id, - large_chunks_enabled=large_chunks_enabled, tenant_id=tenant_id, chunk_count=chunk_count, fields=fields, diff --git a/backend/onyx/background/celery/tasks/shared/tasks.py b/backend/onyx/background/celery/tasks/shared/tasks.py index e30c3375697..e1e901475cd 100644 --- a/backend/onyx/background/celery/tasks/shared/tasks.py +++ b/backend/onyx/background/celery/tasks/shared/tasks.py @@ -83,7 +83,6 @@ def document_by_cc_pair_cleanup_task( # TODO: fix the large chunks enabled chunks_affected = retry_index.delete_single( document_id, - large_chunks_enabled=False, tenant_id=tenant_id, chunk_count=None, ) diff --git a/backend/onyx/document_index/interfaces.py b/backend/onyx/document_index/interfaces.py index d54f317247e..85bfb7ae3d6 100644 --- a/backend/onyx/document_index/interfaces.py +++ b/backend/onyx/document_index/interfaces.py @@ -109,7 +109,7 @@ class UpdateRequest: Does not update any of the None fields """ - document_ids: list[str] + minimal_document_indexing_info: list[MinimalDocumentIndexingInfo] # all other fields except these 4 will always be left alone by the update request access: DocumentAccess | None = None document_sets: set[str] | None = None @@ -221,7 +221,6 @@ class Deletable(abc.ABC): def delete_single( self, doc_id: str, - large_chunks_enabled: bool, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -248,7 +247,6 @@ class Updatable(abc.ABC): def update_single( self, doc_id: str, - large_chunks_enabled: bool, chunk_count: int | None, tenant_id: str | None, fields: VespaDocumentFields, @@ -270,7 +268,9 @@ def update_single( raise NotImplementedError @abc.abstractmethod - def update(self, update_requests: list[UpdateRequest]) -> None: + def update( + self, update_requests: list[UpdateRequest], tenant_id: str | None + ) -> None: """ Updates some set of chunks. The document and fields to update are specified in the update requests. Each update request in the list applies its changes to a list of document ids. diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 54cd608f6ef..2519c3e6f4c 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -25,6 +25,7 @@ from onyx.configs.constants import KV_REINDEX_KEY from onyx.context.search.models import IndexFilters from onyx.context.search.models import InferenceChunkUncleaned +from onyx.db.engine import get_session_with_tenant from onyx.document_index.document_index_utils import assemble_document_chunk_info from onyx.document_index.interfaces import DocumentIndex from onyx.document_index.interfaces import DocumentInsertionRecord @@ -35,15 +36,15 @@ from onyx.document_index.interfaces import VespaChunkRequest from onyx.document_index.interfaces import VespaDocumentFields from onyx.document_index.vespa.chunk_retrieval import batch_search_api_retrieval -from onyx.document_index.vespa.chunk_retrieval import ( - get_all_vespa_ids_for_document_id, -) from onyx.document_index.vespa.chunk_retrieval import ( parallel_visit_api_retrieval, ) from onyx.document_index.vespa.chunk_retrieval import query_vespa from onyx.document_index.vespa.deletion import delete_vespa_chunks from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks +from onyx.document_index.vespa.indexing_utils import ( + check_enable_large_chunks_and_multipass, +) from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client @@ -431,21 +432,21 @@ def _update_chunk( failure_msg = f"Failed to update document: {future_to_document_id[future]}" raise requests.HTTPError(failure_msg) from e - def update(self, update_requests: list[UpdateRequest]) -> None: + def update( + self, update_requests: list[UpdateRequest], tenant_id: str | None + ) -> None: logger.debug(f"Updating {len(update_requests)} documents in Vespa") # Handle Vespa character limitations # Mutating update_requests but it's not used later anyway for update_request in update_requests: - update_request.document_ids = [ - replace_invalid_doc_id_characters(doc_id) - for doc_id in update_request.document_ids - ] + for doc_info in update_request.minimal_document_indexing_info: + doc_info.doc_id = replace_invalid_doc_id_characters(doc_info.doc_id) update_start = time.monotonic() processed_updates_requests: list[_VespaUpdateRequest] = [] - all_doc_chunk_ids: dict[str, list[str]] = {} + all_doc_chunk_ids: dict[str, list[UUID]] = {} # Fetch all chunks for each document ahead of time index_names = [self.index_name] @@ -453,30 +454,24 @@ def update(self, update_requests: list[UpdateRequest]) -> None: index_names.append(self.secondary_index_name) chunk_id_start_time = time.monotonic() - with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: - future_to_doc_chunk_ids = { - executor.submit( - get_all_vespa_ids_for_document_id, - document_id=document_id, - index_name=index_name, - filters=None, - get_large_chunks=True, - ): (document_id, index_name) - for index_name in index_names - for update_request in update_requests - for document_id in update_request.document_ids - } - for future in concurrent.futures.as_completed(future_to_doc_chunk_ids): - document_id, index_name = future_to_doc_chunk_ids[future] - try: - doc_chunk_ids = future.result() - if document_id not in all_doc_chunk_ids: - all_doc_chunk_ids[document_id] = [] - all_doc_chunk_ids[document_id].extend(doc_chunk_ids) - except Exception as e: - logger.error( - f"Error retrieving chunk IDs for document {document_id} in index {index_name}: {e}" - ) + with get_vespa_http_client() as http_client: + for update_request in update_requests: + for doc_info in update_request.minimal_document_indexing_info: + for index_name in index_names: + doc_chunk_info = self.enrich_basic_chunk_info( + index_name=index_name, + http_client=http_client, + document_id=doc_info.doc_id, + previous_chunk_count=doc_info.chunk_start_index, + new_chunk_count=0, + ) + doc_chunk_ids = assemble_document_chunk_info( + enriched_document_info_list=[doc_chunk_info], + tenant_id=tenant_id, + large_chunks_enabled=False, + ) + all_doc_chunk_ids[doc_info.doc_id] = doc_chunk_ids + logger.debug( f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs" ) @@ -505,11 +500,11 @@ def update(self, update_requests: list[UpdateRequest]) -> None: logger.error("Update request received but nothing to update") continue - for document_id in update_request.document_ids: - for doc_chunk_id in all_doc_chunk_ids[document_id]: + for doc_info in update_request.minimal_document_indexing_info: + for doc_chunk_id in all_doc_chunk_ids[doc_info.doc_id]: processed_updates_requests.append( _VespaUpdateRequest( - document_id=document_id, + document_id=doc_info.doc_id, url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}", update_request=update_dict, ) @@ -566,7 +561,6 @@ def update_single_chunk( def update_single( self, doc_id: str, - large_chunks_enabled: bool, chunk_count: int | None, tenant_id: str | None, fields: VespaDocumentFields, @@ -583,6 +577,15 @@ def update_single( with get_vespa_http_client(http2=False) as http_client: for index_name in index_names: + large_chunks_enabled = False + + with get_session_with_tenant(tenant_id=tenant_id) as db_session: + multipass_config = check_enable_large_chunks_and_multipass( + db_session=db_session, + primary_index=index_name == self.index_name, + ) + large_chunks_enabled = multipass_config.enable_large_chunks + enriched_doc_infos = VespaIndex.enrich_basic_chunk_info( index_name=index_name, http_client=http_client, @@ -601,12 +604,12 @@ def update_single( self.update_single_chunk( doc_chunk_id=doc_chunk_id, index_name=index_name, fields=fields ) + return doc_chunk_count def delete_single( self, doc_id: str, - large_chunks_enabled: bool, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -629,6 +632,14 @@ def delete_single( max_workers=NUM_THREADS ) as executor: for index_name in index_names: + large_chunks_enabled = False + with get_session_with_tenant(tenant_id=tenant_id) as db_session: + multipass_config = check_enable_large_chunks_and_multipass( + db_session=db_session, + primary_index=index_name == self.index_name, + ) + large_chunks_enabled = multipass_config.enable_large_chunks + enriched_doc_infos = VespaIndex.enrich_basic_chunk_info( index_name=index_name, http_client=http_client, diff --git a/backend/onyx/document_index/vespa/indexing_utils.py b/backend/onyx/document_index/vespa/indexing_utils.py index 6781cae1d93..a8d35e996db 100644 --- a/backend/onyx/document_index/vespa/indexing_utils.py +++ b/backend/onyx/document_index/vespa/indexing_utils.py @@ -7,10 +7,15 @@ import httpx from retry import retry +from sqlalchemy.orm import Session +from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING from onyx.connectors.cross_connector_utils.miscellaneous_utils import ( get_experts_stores_representations, ) +from onyx.db.models import SearchSettings +from onyx.db.search_settings import get_current_search_settings +from onyx.db.search_settings import get_secondary_search_settings from onyx.document_index.document_index_utils import get_uuid_from_chunk from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old from onyx.document_index.interfaces import MinimalDocumentIndexingInfo @@ -45,6 +50,8 @@ from onyx.document_index.vespa_constants import TITLE from onyx.document_index.vespa_constants import TITLE_EMBEDDING from onyx.indexing.models import DocMetadataAwareIndexChunk +from onyx.indexing.models import EmbeddingProvider +from onyx.indexing.models import MultipassConfig from onyx.utils.logger import setup_logger logger = setup_logger() @@ -263,5 +270,49 @@ def check_for_final_chunk_existence( ) if not _does_doc_chunk_exist(doc_chunk_id, index_name, http_client): return index - index += 1 + + +def should_use_multipass(search_settings: SearchSettings | None) -> bool: + """ + Determines whether multipass should be used based on the search settings + or the default config if settings are unavailable. + """ + if search_settings is not None: + return search_settings.multipass_indexing + return ENABLE_MULTIPASS_INDEXING + + +def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bool: + """ + Given multipass usage and an embedder, decides whether large chunks are allowed + based on model/provider constraints. + """ + # Only local models that support a larger context are from Nomic + # Cohere does not support larger contexts (they recommend not going above ~512 tokens) + return ( + multipass + and search_settings.model_name.startswith("nomic-ai") + and search_settings.provider_type != EmbeddingProvider.COHERE + ) + + +def check_enable_large_chunks_and_multipass( + db_session: Session, primary_index: bool = True +) -> MultipassConfig: + """ + Determines whether to enable multipass and large chunks by examining + the current search settings and the embedder configuration. + """ + search_settings = ( + get_current_search_settings(db_session) + if primary_index + else get_secondary_search_settings(db_session) + ) + multipass = should_use_multipass(search_settings) + if not search_settings: + return MultipassConfig(multipass_indexing=False, enable_large_chunks=False) + enable_large_chunks = can_use_large_chunks(multipass, search_settings) + return MultipassConfig( + multipass_indexing=multipass, enable_large_chunks=enable_large_chunks + ) diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index 1a2e73b2ab1..1502ac6b3d3 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -11,7 +11,6 @@ from onyx.access.access import get_access_for_documents from onyx.access.models import DocumentAccess -from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING from onyx.configs.app_configs import INDEXING_EXCEPTION_LIMIT from onyx.configs.app_configs import MAX_DOCUMENT_CHARS from onyx.configs.constants import DEFAULT_BOOST @@ -31,12 +30,14 @@ from onyx.db.document_set import fetch_document_sets_for_documents from onyx.db.index_attempt import create_index_attempt_error from onyx.db.models import Document as DBDocument -from onyx.db.search_settings import get_current_search_settings from onyx.db.tag import create_or_add_document_tag from onyx.db.tag import create_or_add_document_tag_list from onyx.document_index.interfaces import DocumentIndex from onyx.document_index.interfaces import DocumentMetadata from onyx.document_index.interfaces import IndexBatchParams +from onyx.document_index.vespa.indexing_utils import ( + check_enable_large_chunks_and_multipass, +) from onyx.indexing.chunker import Chunker from onyx.indexing.embedder import IndexingEmbedder from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface @@ -44,7 +45,6 @@ from onyx.indexing.models import DocMetadataAwareIndexChunk from onyx.utils.logger import setup_logger from onyx.utils.timing import log_function_time -from shared_configs.enums import EmbeddingProvider logger = setup_logger() @@ -479,28 +479,6 @@ def index_doc_batch( return result -def check_enable_large_chunks_and_multipass( - embedder: IndexingEmbedder, db_session: Session -) -> tuple[bool, bool]: - search_settings = get_current_search_settings(db_session) - multipass = ( - search_settings.multipass_indexing - if search_settings - else ENABLE_MULTIPASS_INDEXING - ) - - enable_large_chunks = ( - multipass - and - # Only local models that supports larger context are from Nomic - (embedder.model_name.startswith("nomic-ai")) - and - # Cohere does not support larger context they recommend not going above 512 tokens - embedder.provider_type != EmbeddingProvider.COHERE - ) - return multipass, enable_large_chunks - - def build_indexing_pipeline( *, embedder: IndexingEmbedder, @@ -513,14 +491,14 @@ def build_indexing_pipeline( callback: IndexingHeartbeatInterface | None = None, ) -> IndexingPipelineProtocol: """Builds a pipeline which takes in a list (batch) of docs and indexes them.""" - multipass, enable_large_chunks = check_enable_large_chunks_and_multipass( - embedder, db_session + multipass_config = check_enable_large_chunks_and_multipass( + db_session, primary_index=True ) chunker = chunker or Chunker( tokenizer=embedder.embedding_model.tokenizer, - enable_multipass=multipass, - enable_large_chunks=enable_large_chunks, + enable_multipass=multipass_config.multipass_indexing, + enable_large_chunks=multipass_config.enable_large_chunks, # after every doc, update status in case there are a bunch of really long docs callback=callback, ) diff --git a/backend/onyx/indexing/models.py b/backend/onyx/indexing/models.py index e536428282b..44a8419cb95 100644 --- a/backend/onyx/indexing/models.py +++ b/backend/onyx/indexing/models.py @@ -154,3 +154,8 @@ def from_db_model(cls, search_settings: "SearchSettings") -> "IndexingSetting": index_name=search_settings.index_name, multipass_indexing=search_settings.multipass_indexing, ) + + +class MultipassConfig(BaseModel): + multipass_indexing: bool + enable_large_chunks: bool From d0629d9cded5d8d10516b7d7b27343fd059e0def Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 13:01:47 -0800 Subject: [PATCH 03/11] update updates --- .../onyx/background/celery/tasks/shared/RetryDocumentIndex.py | 2 +- backend/onyx/background/celery/tasks/shared/tasks.py | 1 - backend/onyx/background/celery/tasks/vespa/tasks.py | 2 -- backend/onyx/document_index/vespa/index.py | 1 + 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py index 1f0cd000402..cf3afefeb95 100644 --- a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py +++ b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py @@ -48,11 +48,11 @@ def delete_single( def update_single( self, doc_id: str, - large_chunks_enabled: bool, tenant_id: str | None, chunk_count: int | None, fields: VespaDocumentFields, ) -> int: + print("Update single") return self.index.update_single( doc_id, tenant_id=tenant_id, diff --git a/backend/onyx/background/celery/tasks/shared/tasks.py b/backend/onyx/background/celery/tasks/shared/tasks.py index e1e901475cd..a7050b57498 100644 --- a/backend/onyx/background/celery/tasks/shared/tasks.py +++ b/backend/onyx/background/celery/tasks/shared/tasks.py @@ -118,7 +118,6 @@ def document_by_cc_pair_cleanup_task( # TODO: fix the large chunks enabled chunks_affected = retry_index.update_single( document_id, - large_chunks_enabled=False, tenant_id=tenant_id, chunk_count=doc.chunk_count, fields=fields, diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py index 591d55ec91b..8eabeb7d8a8 100644 --- a/backend/onyx/background/celery/tasks/vespa/tasks.py +++ b/backend/onyx/background/celery/tasks/vespa/tasks.py @@ -992,10 +992,8 @@ def vespa_metadata_sync_task( ) # update Vespa. OK if doc doesn't exist. Raises exception otherwise. - # TODO: fix hte large chunks enabled chunks_affected = retry_index.update_single( document_id, - large_chunks_enabled=False, tenant_id=tenant_id, chunk_count=doc.chunk_count, fields=fields, diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 2519c3e6f4c..681dee153e8 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -601,6 +601,7 @@ def update_single( doc_chunk_count += len(doc_chunk_ids) for doc_chunk_id in doc_chunk_ids: + print("THIS ONE is being updated") self.update_single_chunk( doc_chunk_id=doc_chunk_id, index_name=index_name, fields=fields ) From 87efb1db5eb8cd5074c2110f974e6bbe9f24617d Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 13:02:34 -0800 Subject: [PATCH 04/11] nit --- backend/onyx/document_index/vespa/index.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 681dee153e8..4dc42a9671c 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -578,7 +578,6 @@ def update_single( with get_vespa_http_client(http2=False) as http_client: for index_name in index_names: large_chunks_enabled = False - with get_session_with_tenant(tenant_id=tenant_id) as db_session: multipass_config = check_enable_large_chunks_and_multipass( db_session=db_session, From c2db6de8c3fb2e065cbbd19dcdf40820d6fddb24 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 13:09:09 -0800 Subject: [PATCH 05/11] clean up args --- .../celery/tasks/shared/RetryDocumentIndex.py | 3 ++- backend/onyx/document_index/interfaces.py | 8 +++++--- backend/onyx/document_index/vespa/index.py | 11 +++++++++-- .../scripts/force_delete_connector_by_id.py | 19 ++++++++++++------- backend/scripts/orphan_doc_cleanup_script.py | 17 ++++++++++++----- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py index cf3afefeb95..34a3e0a8864 100644 --- a/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py +++ b/backend/onyx/background/celery/tasks/shared/RetryDocumentIndex.py @@ -31,6 +31,7 @@ def __init__(self, index: DocumentIndex): def delete_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -48,11 +49,11 @@ def delete_single( def update_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, fields: VespaDocumentFields, ) -> int: - print("Update single") return self.index.update_single( doc_id, tenant_id=tenant_id, diff --git a/backend/onyx/document_index/interfaces.py b/backend/onyx/document_index/interfaces.py index 85bfb7ae3d6..08dbfdc9efb 100644 --- a/backend/onyx/document_index/interfaces.py +++ b/backend/onyx/document_index/interfaces.py @@ -136,7 +136,7 @@ def __init__( index_name: str, secondary_index_name: str | None, *args: Any, - **kwargs: Any + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) self.index_name = index_name @@ -221,6 +221,7 @@ class Deletable(abc.ABC): def delete_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -247,8 +248,9 @@ class Updatable(abc.ABC): def update_single( self, doc_id: str, - chunk_count: int | None, + *, tenant_id: str | None, + chunk_count: int | None, fields: VespaDocumentFields, ) -> int: """ @@ -269,7 +271,7 @@ def update_single( @abc.abstractmethod def update( - self, update_requests: list[UpdateRequest], tenant_id: str | None + self, update_requests: list[UpdateRequest], *, tenant_id: str | None ) -> None: """ Updates some set of chunks. The document and fields to update are specified in the update diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 4dc42a9671c..78cfea85656 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -433,7 +433,7 @@ def _update_chunk( raise requests.HTTPError(failure_msg) from e def update( - self, update_requests: list[UpdateRequest], tenant_id: str | None + self, update_requests: list[UpdateRequest], *, tenant_id: str | None ) -> None: logger.debug(f"Updating {len(update_requests)} documents in Vespa") @@ -561,6 +561,7 @@ def update_single_chunk( def update_single( self, doc_id: str, + *, chunk_count: int | None, tenant_id: str | None, fields: VespaDocumentFields, @@ -610,6 +611,7 @@ def update_single( def delete_single( self, doc_id: str, + *, tenant_id: str | None, chunk_count: int | None, ) -> int: @@ -795,7 +797,12 @@ def enrich_basic_chunk_info( return enriched_doc_info @classmethod - def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None: + def delete_entries_by_tenant_id( + cls, + *, + tenant_id: str, + index_name: str, + ) -> None: """ Deletes all entries in the specified index with the given tenant_id. diff --git a/backend/scripts/force_delete_connector_by_id.py b/backend/scripts/force_delete_connector_by_id.py index 3f0bf01ad61..39b98b9bcb8 100755 --- a/backend/scripts/force_delete_connector_by_id.py +++ b/backend/scripts/force_delete_connector_by_id.py @@ -5,6 +5,7 @@ from sqlalchemy import delete from sqlalchemy.orm import Session +from onyx.db.document import delete_documents_complete__no_commit from onyx.db.enums import ConnectorCredentialPairStatus # Modify sys.path @@ -70,14 +71,17 @@ def _unsafe_deletion( if not documents: break - # document_ids = [document.id for document in documents] - # for doc_id in document_ids: - # document_index.delete_single(doc_id) + for document in documents: + document_index.delete_single( + doc_id=document.id, + tenant_id=None, + chunk_count=document.chunk_count, + ) - # delete_documents_complete__no_commit( - # db_session=db_session, - # document_ids=document_ids, - # ) + delete_documents_complete__no_commit( + db_session=db_session, + document_ids=[document.id for document in documents], + ) num_docs_deleted += len(documents) @@ -215,6 +219,7 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None: parser.add_argument( "connector_id", type=int, help="The ID of the connector to delete" ) + args = parser.parse_args() with get_session_context_manager() as db_session: _delete_connector(args.connector_id, db_session) diff --git a/backend/scripts/orphan_doc_cleanup_script.py b/backend/scripts/orphan_doc_cleanup_script.py index 6d404f7f52e..baf52d93b05 100644 --- a/backend/scripts/orphan_doc_cleanup_script.py +++ b/backend/scripts/orphan_doc_cleanup_script.py @@ -15,6 +15,7 @@ from onyx.db.document import delete_documents_complete__no_commit # noqa: E402 from onyx.db.search_settings import get_current_search_settings # noqa: E402 from onyx.document_index.vespa.index import VespaIndex # noqa: E402 +from onyx.db.document import get_document # noqa: E402 BATCH_SIZE = 100 @@ -63,6 +64,10 @@ def main() -> None: with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor: def process_doc(doc_id: str) -> str | None: + document = get_document(doc_id, db_session) + if not document: + print(f"Document {doc_id} not found in Postgres") + return None # Check if document exists in Vespa first try: chunks = vespa_index.id_based_retrieval( @@ -83,11 +88,13 @@ def process_doc(doc_id: str) -> str | None: try: print(f"Deleting document {doc_id} in Vespa") - # chunks_deleted = vespa_index.delete_single(doc_id) - # if chunks_deleted > 0: - # print( - # f"Deleted {chunks_deleted} chunks for document {doc_id}" - # ) + chunks_deleted = vespa_index.delete_single( + doc_id, tenant_id=None, chunk_count=document.chunk_count + ) + if chunks_deleted > 0: + print( + f"Deleted {chunks_deleted} chunks for document {doc_id}" + ) return doc_id except Exception as e: print( From e7173378f62854d909b07b8997999127ce07ca9f Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 16:32:12 -0800 Subject: [PATCH 06/11] update for clarity + best practices --- .../background/celery/tasks/shared/tasks.py | 2 -- .../document_index/document_index_utils.py | 5 ++- backend/onyx/document_index/vespa/index.py | 32 +++++++++---------- .../document_index/vespa/indexing_utils.py | 2 +- backend/onyx/indexing/indexing_pipeline.py | 6 ++-- 5 files changed, 22 insertions(+), 25 deletions(-) diff --git a/backend/onyx/background/celery/tasks/shared/tasks.py b/backend/onyx/background/celery/tasks/shared/tasks.py index a7050b57498..16434a6b5f5 100644 --- a/backend/onyx/background/celery/tasks/shared/tasks.py +++ b/backend/onyx/background/celery/tasks/shared/tasks.py @@ -80,7 +80,6 @@ def document_by_cc_pair_cleanup_task( # delete it from vespa and the db action = "delete" - # TODO: fix the large chunks enabled chunks_affected = retry_index.delete_single( document_id, tenant_id=tenant_id, @@ -115,7 +114,6 @@ def document_by_cc_pair_cleanup_task( ) # update Vespa. OK if doc doesn't exist. Raises exception otherwise. - # TODO: fix the large chunks enabled chunks_affected = retry_index.update_single( document_id, tenant_id=tenant_id, diff --git a/backend/onyx/document_index/document_index_utils.py b/backend/onyx/document_index/document_index_utils.py index 8976b556eb2..4770a8a2843 100644 --- a/backend/onyx/document_index/document_index_utils.py +++ b/backend/onyx/document_index/document_index_utils.py @@ -37,7 +37,10 @@ def translate_boost_count_to_multiplier(boost: int) -> float: return 2 / (1 + math.exp(-1 * boost / 3)) -def assemble_document_chunk_info( +# Assembles a list of Vespa chunk IDs for a document +# given the required context. This can be used to directly query +# Vespa's Document API. +def get_document_chunk_ids( enriched_document_info_list: list[EnrichedDocumentIndexingInfo], tenant_id: str | None, large_chunks_enabled: bool, diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 78cfea85656..d8d4b41e4a4 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -26,7 +26,7 @@ from onyx.context.search.models import IndexFilters from onyx.context.search.models import InferenceChunkUncleaned from onyx.db.engine import get_session_with_tenant -from onyx.document_index.document_index_utils import assemble_document_chunk_info +from onyx.document_index.document_index_utils import get_document_chunk_ids from onyx.document_index.interfaces import DocumentIndex from onyx.document_index.interfaces import DocumentInsertionRecord from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo @@ -42,11 +42,11 @@ from onyx.document_index.vespa.chunk_retrieval import query_vespa from onyx.document_index.vespa.deletion import delete_vespa_chunks from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks -from onyx.document_index.vespa.indexing_utils import ( - check_enable_large_chunks_and_multipass, -) from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy +from onyx.document_index.vespa.indexing_utils import ( + get_multipass_config, +) from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client from onyx.document_index.vespa.shared_utils.utils import ( replace_invalid_doc_id_characters, @@ -355,7 +355,7 @@ def index( # Now, for each doc, we know exactly where to start and end our deletion # So let's generate the chunk IDs for each chunk to delete - chunks_to_delete = assemble_document_chunk_info( + chunks_to_delete = get_document_chunk_ids( enriched_document_info_list=enriched_doc_infos, tenant_id=tenant_id, large_chunks_enabled=large_chunks_enabled, @@ -458,14 +458,14 @@ def update( for update_request in update_requests: for doc_info in update_request.minimal_document_indexing_info: for index_name in index_names: - doc_chunk_info = self.enrich_basic_chunk_info( + doc_chunk_info = VespaIndex.enrich_basic_chunk_info( index_name=index_name, http_client=http_client, document_id=doc_info.doc_id, previous_chunk_count=doc_info.chunk_start_index, new_chunk_count=0, ) - doc_chunk_ids = assemble_document_chunk_info( + doc_chunk_ids = get_document_chunk_ids( enriched_document_info_list=[doc_chunk_info], tenant_id=tenant_id, large_chunks_enabled=False, @@ -578,9 +578,8 @@ def update_single( with get_vespa_http_client(http2=False) as http_client: for index_name in index_names: - large_chunks_enabled = False with get_session_with_tenant(tenant_id=tenant_id) as db_session: - multipass_config = check_enable_large_chunks_and_multipass( + multipass_config = get_multipass_config( db_session=db_session, primary_index=index_name == self.index_name, ) @@ -593,7 +592,7 @@ def update_single( previous_chunk_count=chunk_count, new_chunk_count=0, ) - doc_chunk_ids = assemble_document_chunk_info( + doc_chunk_ids = get_document_chunk_ids( enriched_document_info_list=[enriched_doc_infos], tenant_id=tenant_id, large_chunks_enabled=large_chunks_enabled, @@ -601,7 +600,6 @@ def update_single( doc_chunk_count += len(doc_chunk_ids) for doc_chunk_id in doc_chunk_ids: - print("THIS ONE is being updated") self.update_single_chunk( doc_chunk_id=doc_chunk_id, index_name=index_name, fields=fields ) @@ -615,9 +613,6 @@ def delete_single( tenant_id: str | None, chunk_count: int | None, ) -> int: - """Possibly faster overall than the delete method due to using a single - delete call with a selection query.""" - total_chunks_deleted = 0 doc_id = replace_invalid_doc_id_characters(doc_id) @@ -634,9 +629,8 @@ def delete_single( max_workers=NUM_THREADS ) as executor: for index_name in index_names: - large_chunks_enabled = False with get_session_with_tenant(tenant_id=tenant_id) as db_session: - multipass_config = check_enable_large_chunks_and_multipass( + multipass_config = get_multipass_config( db_session=db_session, primary_index=index_name == self.index_name, ) @@ -649,7 +643,7 @@ def delete_single( previous_chunk_count=chunk_count, new_chunk_count=0, ) - chunks_to_delete = assemble_document_chunk_info( + chunks_to_delete = get_document_chunk_ids( enriched_document_info_list=[enriched_doc_infos], tenant_id=tenant_id, large_chunks_enabled=large_chunks_enabled, @@ -762,6 +756,10 @@ def admin_retrieval( return query_vespa(params) + # Retrieves chunk information for a document: + # - Determines the last indexed chunk + # - Identifies if the document uses the old or new chunk ID system + # This data is crucial for Vespa document updates without relying on the visit API. @classmethod def enrich_basic_chunk_info( cls, diff --git a/backend/onyx/document_index/vespa/indexing_utils.py b/backend/onyx/document_index/vespa/indexing_utils.py index a8d35e996db..857162382ef 100644 --- a/backend/onyx/document_index/vespa/indexing_utils.py +++ b/backend/onyx/document_index/vespa/indexing_utils.py @@ -297,7 +297,7 @@ def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bo ) -def check_enable_large_chunks_and_multipass( +def get_multipass_config( db_session: Session, primary_index: bool = True ) -> MultipassConfig: """ diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index 1502ac6b3d3..74c6c08fd8a 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -36,7 +36,7 @@ from onyx.document_index.interfaces import DocumentMetadata from onyx.document_index.interfaces import IndexBatchParams from onyx.document_index.vespa.indexing_utils import ( - check_enable_large_chunks_and_multipass, + get_multipass_config, ) from onyx.indexing.chunker import Chunker from onyx.indexing.embedder import IndexingEmbedder @@ -491,9 +491,7 @@ def build_indexing_pipeline( callback: IndexingHeartbeatInterface | None = None, ) -> IndexingPipelineProtocol: """Builds a pipeline which takes in a list (batch) of docs and indexes them.""" - multipass_config = check_enable_large_chunks_and_multipass( - db_session, primary_index=True - ) + multipass_config = get_multipass_config(db_session, primary_index=True) chunker = chunker or Chunker( tokenizer=embedder.embedding_model.tokenizer, From 7b20e11300c1173d041bfd776cec93c63dd19a3c Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Tue, 7 Jan 2025 21:07:21 -0800 Subject: [PATCH 07/11] nit + logs --- .../background/celery/tasks/shared/tasks.py | 5 +- .../background/celery/tasks/vespa/tasks.py | 4 + backend/onyx/db/document.py | 8 + backend/onyx/document_index/vespa/index.py | 94 +++++- backend/tests/__init__.py | 0 backend/tests/integration/__init__.py | 0 .../common_utils/managers/document.py | 11 +- .../connector/test_connector_deletion.py | 318 +++++++++--------- 8 files changed, 265 insertions(+), 175 deletions(-) create mode 100644 backend/tests/__init__.py create mode 100644 backend/tests/integration/__init__.py diff --git a/backend/onyx/background/celery/tasks/shared/tasks.py b/backend/onyx/background/celery/tasks/shared/tasks.py index 16434a6b5f5..b078c282b58 100644 --- a/backend/onyx/background/celery/tasks/shared/tasks.py +++ b/backend/onyx/background/celery/tasks/shared/tasks.py @@ -12,6 +12,7 @@ from onyx.configs.constants import OnyxCeleryTask from onyx.db.document import delete_document_by_connector_credential_pair__no_commit from onyx.db.document import delete_documents_complete__no_commit +from onyx.db.document import fetch_chunk_count_for_document from onyx.db.document import get_document from onyx.db.document import get_document_connector_count from onyx.db.document import mark_document_as_modified @@ -80,10 +81,12 @@ def document_by_cc_pair_cleanup_task( # delete it from vespa and the db action = "delete" + chunk_count = fetch_chunk_count_for_document(document_id, db_session) + chunks_affected = retry_index.delete_single( document_id, tenant_id=tenant_id, - chunk_count=None, + chunk_count=chunk_count, ) delete_documents_complete__no_commit( db_session=db_session, diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py index 8eabeb7d8a8..f3c7b67818e 100644 --- a/backend/onyx/background/celery/tasks/vespa/tasks.py +++ b/backend/onyx/background/celery/tasks/vespa/tasks.py @@ -990,6 +990,10 @@ def vespa_metadata_sync_task( boost=doc.boost, hidden=doc.hidden, ) + logger.info("\n\n\n\n\n\n\n\nuPDATING DOCUMENT") + logger.info(document_id) + logger.info(fields) + logger.info(doc_access) # update Vespa. OK if doc doesn't exist. Raises exception otherwise. chunks_affected = retry_index.update_single( diff --git a/backend/onyx/db/document.py b/backend/onyx/db/document.py index 7f11b64d824..b12c852f493 100644 --- a/backend/onyx/db/document.py +++ b/backend/onyx/db/document.py @@ -702,3 +702,11 @@ def fetch_chunk_counts_for_documents( # otherwise cast to str if you need to be sure it's a string: return [(str(row[0]), row[1]) for row in results] # or row.id, row.chunk_count if they are named attributes in your ORM model + + +def fetch_chunk_count_for_document( + document_id: str, + db_session: Session, +) -> int | None: + stmt = select(DbDocument.chunk_count).where(DbDocument.id == document_id) + return db_session.execute(stmt).scalar_one_or_none() diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index d8d4b41e4a4..8d02ee0f04b 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -517,47 +517,100 @@ def update( ) def update_single_chunk( - self, doc_chunk_id: UUID, index_name: str, fields: VespaDocumentFields + self, + doc_chunk_id: UUID, # The chunk ID, presumably your docid in Vespa + index_name: str, # The Vespa doc type name (must match schema) + fields: VespaDocumentFields, # Your container of fields to update + doc_id: str, # Possibly used for logging ) -> None: - """Update a single chunk in Vespa using its chunk ID.""" + """ + Update a single "chunk" (document) in Vespa using its chunk ID. + Requires that 'index_name' matches the document type in your Vespa schema. + """ - # Build the update request + # 1) Build the partial-update JSON update_dict: dict[str, dict] = {"fields": {}} + if fields.boost is not None: update_dict["fields"][BOOST] = {"assign": fields.boost} + if fields.document_sets is not None: + # WeightedSet needs a map { item: weight, ... } update_dict["fields"][DOCUMENT_SETS] = { "assign": {document_set: 1 for document_set in fields.document_sets} } + if fields.access is not None: + # Another WeightedSet update_dict["fields"][ACCESS_CONTROL_LIST] = { "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()} } + if fields.hidden is not None: update_dict["fields"][HIDDEN] = {"assign": fields.hidden} if not update_dict["fields"]: - logger.error("Update request received but nothing to update") + logger.error("Update request received but nothing to update.") return + # 2) Construct the correct doc URL + # Make sure DOCUMENT_ID_ENDPOINT includes something like: + # http://:/document/v1/{namespace}/{doc_type}/docid + # Also note the "?create=true" so that partial updates upsert. + vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true" + + logger.info("Vespa partial-update payload: %s", update_dict) + logger.info('Attempting PUT to URL: "%s"', vespa_url) + with get_vespa_http_client(http2=False) as http_client: - vespa_url = ( - f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}" - ) + # --- (Optional) Check the doc before updating --- + try: + before_resp = http_client.get(vespa_url) + before_resp.raise_for_status() + before_doc_json = before_resp.json() + # You might not want to log large fields like embeddings, etc. + # If so, pop them out before logging: + before_doc_json.pop("embeddings", None) + before_doc_json.pop("title_embedding", None) + logger.info("Document before update [%s]: %s", doc_id, before_doc_json) + except httpx.HTTPError: + logger.warning( + "Document %s did not exist prior to update (this might be normal if it's new).", + doc_chunk_id, + ) + # --- Perform the partial update --- try: - logger.debug(f'update_single PUT on URL "{vespa_url}"') resp = http_client.put( vespa_url, headers={"Content-Type": "application/json"}, json=update_dict, ) resp.raise_for_status() - except httpx.HTTPStatusError as e: - logger.error(f"Failed to update chunk, details: {e.response.text}") + logger.error( + "Failed to update doc chunk %s (doc_id=%s). Details: %s", + doc_chunk_id, + doc_id, + e.response.text, + ) raise + # --- (Optional) Check the doc after updating --- + try: + after_resp = http_client.get(vespa_url) + after_resp.raise_for_status() + after_doc_json = after_resp.json() + after_doc_json.pop("embeddings", None) + after_doc_json.pop("title_embedding", None) + logger.info("Document after update [%s]: %s", doc_id, after_doc_json) + except httpx.HTTPError as e: + logger.warning( + "Failed fetching document after update for %s: %s", + doc_chunk_id, + str(e), + ) + def update_single( self, doc_id: str, @@ -570,6 +623,9 @@ def update_single( function will complete with no errors or exceptions. Handle other exceptions if you wish to implement retry behavior """ + logger.info(f"RIGHT NOW UPDATING document {doc_id} with fields {fields}") + logger.info(fields.__dict__) + doc_chunk_count = 0 index_names = [self.index_name] @@ -592,17 +648,28 @@ def update_single( previous_chunk_count=chunk_count, new_chunk_count=0, ) + logger.info("ENRICHED DOC INFO") + logger.info(enriched_doc_infos) + doc_chunk_ids = get_document_chunk_ids( enriched_document_info_list=[enriched_doc_infos], tenant_id=tenant_id, large_chunks_enabled=large_chunks_enabled, ) + + logger.info("UPDATING len(doc_chunk_ids)") + doc_chunk_count += len(doc_chunk_ids) for doc_chunk_id in doc_chunk_ids: + logger.info("UPDATING CHUNK") self.update_single_chunk( - doc_chunk_id=doc_chunk_id, index_name=index_name, fields=fields + doc_chunk_id=doc_chunk_id, + index_name=index_name, + fields=fields, + doc_id=doc_id, ) + logger.info(f"UPDATED A TOTAL OF {doc_chunk_count} CHUNKS for {doc_id}") return doc_chunk_count @@ -613,6 +680,7 @@ def delete_single( tenant_id: str | None, chunk_count: int | None, ) -> int: + print("\n\n\n\n\n\n\n\nDELETE") total_chunks_deleted = 0 doc_id = replace_invalid_doc_id_characters(doc_id) @@ -643,6 +711,9 @@ def delete_single( previous_chunk_count=chunk_count, new_chunk_count=0, ) + print("the enriched doc info", enriched_doc_infos) + # for doc_info in enriched_doc_infos: + # print(doc_info.__dict__) chunks_to_delete = get_document_chunk_ids( enriched_document_info_list=[enriched_doc_infos], tenant_id=tenant_id, @@ -651,6 +722,7 @@ def delete_single( for doc_chunk_ids_batch in batch_generator( chunks_to_delete, BATCH_SIZE ): + print("DELETING CHUNK") total_chunks_deleted += len(doc_chunk_ids_batch) delete_vespa_chunks( doc_chunk_ids=doc_chunk_ids_batch, diff --git a/backend/tests/__init__.py b/backend/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/tests/integration/__init__.py b/backend/tests/integration/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/tests/integration/common_utils/managers/document.py b/backend/tests/integration/common_utils/managers/document.py index 939a420f014..58465be0149 100644 --- a/backend/tests/integration/common_utils/managers/document.py +++ b/backend/tests/integration/common_utils/managers/document.py @@ -21,8 +21,10 @@ def _verify_document_permissions( group_names: list[str] | None = None, doc_creating_user: DATestUser | None = None, ) -> None: - acl_keys = set(retrieved_doc["access_control_list"].keys()) + acl_keys = set(retrieved_doc.get("access_control_list", {}).keys()) print(f"ACL keys: {acl_keys}") + print(f"Full retrieved document: {retrieved_doc}") + if cc_pair.access_type == AccessType.PUBLIC: if "PUBLIC" not in acl_keys: raise ValueError( @@ -39,11 +41,14 @@ def _verify_document_permissions( if group_names is not None: expected_group_keys = {f"group:{group_name}" for group_name in group_names} + print(f"Expected group keys: {expected_group_keys}") found_group_keys = {key for key in acl_keys if key.startswith("group:")} + print(f"Found group keys: {found_group_keys}") if found_group_keys != expected_group_keys: raise ValueError( - f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. Found: {found_group_keys}, \n" - f"Expected: {expected_group_keys}" + f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. " + f"Expected: {expected_group_keys} Found: {found_group_keys}\n" + f"All ACL keys: {acl_keys}" ) if doc_set_names is not None: diff --git a/backend/tests/integration/tests/connector/test_connector_deletion.py b/backend/tests/integration/tests/connector/test_connector_deletion.py index 8878a502e2a..d2af7a31f56 100644 --- a/backend/tests/integration/tests/connector/test_connector_deletion.py +++ b/backend/tests/integration/tests/connector/test_connector_deletion.py @@ -5,8 +5,6 @@ - updates the document sets and user groups to remove the connector - Ensure that deleting a connector that is part of an overlapping document set and/or user group works as expected """ -from uuid import uuid4 - from sqlalchemy.orm import Session from onyx.db.engine import get_sqlalchemy_engine @@ -192,161 +190,161 @@ def test_connector_deletion(reset: None, vespa_client: vespa_fixture) -> None: ) -def test_connector_deletion_for_overlapping_connectors( - reset: None, vespa_client: vespa_fixture -) -> None: - """Checks to make sure that connectors with overlapping documents work properly. Specifically, that the overlapping - document (1) still exists and (2) has the right document set / group post-deletion of one of the connectors. - """ - # Creating an admin user (first user created is automatically an admin) - admin_user: DATestUser = UserManager.create(name="admin_user") - # create api key - api_key: DATestAPIKey = APIKeyManager.create( - user_performing_action=admin_user, - ) - - # create connectors - cc_pair_1 = CCPairManager.create_from_scratch( - source=DocumentSource.INGESTION_API, - user_performing_action=admin_user, - ) - cc_pair_2 = CCPairManager.create_from_scratch( - source=DocumentSource.INGESTION_API, - user_performing_action=admin_user, - ) - - doc_ids = [str(uuid4())] - cc_pair_1.documents = DocumentManager.seed_dummy_docs( - cc_pair=cc_pair_1, - document_ids=doc_ids, - api_key=api_key, - ) - cc_pair_2.documents = DocumentManager.seed_dummy_docs( - cc_pair=cc_pair_2, - document_ids=doc_ids, - api_key=api_key, - ) - - # verify vespa document exists and that it is not in any document sets or groups - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_1, - doc_set_names=[], - group_names=[], - doc_creating_user=admin_user, - ) - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_2, - doc_set_names=[], - group_names=[], - doc_creating_user=admin_user, - ) - - # create document set - doc_set_1 = DocumentSetManager.create( - name="Test Document Set 1", - cc_pair_ids=[cc_pair_1.id], - user_performing_action=admin_user, - ) - DocumentSetManager.wait_for_sync( - document_sets_to_check=[doc_set_1], - user_performing_action=admin_user, - ) - - print("Document set 1 created and synced") - - # verify vespa document is in the document set - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_1, - doc_set_names=[doc_set_1.name], - doc_creating_user=admin_user, - ) - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_2, - doc_creating_user=admin_user, - ) - - # create a user group and attach it to connector 1 - user_group_1: DATestUserGroup = UserGroupManager.create( - name="Test User Group 1", - cc_pair_ids=[cc_pair_1.id], - user_performing_action=admin_user, - ) - UserGroupManager.wait_for_sync( - user_groups_to_check=[user_group_1], - user_performing_action=admin_user, - ) - cc_pair_1.groups = [user_group_1.id] - - print("User group 1 created and synced") - - # create a user group and attach it to connector 2 - user_group_2: DATestUserGroup = UserGroupManager.create( - name="Test User Group 2", - cc_pair_ids=[cc_pair_2.id], - user_performing_action=admin_user, - ) - UserGroupManager.wait_for_sync( - user_groups_to_check=[user_group_2], - user_performing_action=admin_user, - ) - cc_pair_2.groups = [user_group_2.id] - - print("User group 2 created and synced") - - # verify vespa document is in the user group - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_1, - group_names=[user_group_1.name, user_group_2.name], - doc_creating_user=admin_user, - ) - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_2, - group_names=[user_group_1.name, user_group_2.name], - doc_creating_user=admin_user, - ) - - # delete connector 1 - CCPairManager.pause_cc_pair( - cc_pair=cc_pair_1, - user_performing_action=admin_user, - ) - CCPairManager.delete( - cc_pair=cc_pair_1, - user_performing_action=admin_user, - ) - - # wait for deletion to finish - CCPairManager.wait_for_deletion_completion( - cc_pair_id=cc_pair_1.id, user_performing_action=admin_user - ) - - print("Connector 1 deleted") - - # check that only connector 1 is deleted - # TODO: check for the CC pair rather than the connector once the refactor is done - CCPairManager.verify( - cc_pair=cc_pair_1, - verify_deleted=True, - user_performing_action=admin_user, - ) - CCPairManager.verify( - cc_pair=cc_pair_2, - user_performing_action=admin_user, - ) - - # verify the document is not in any document sets - # verify the document is only in user group 2 - DocumentManager.verify( - vespa_client=vespa_client, - cc_pair=cc_pair_2, - doc_set_names=[], - group_names=[user_group_2.name], - doc_creating_user=admin_user, - verify_deleted=False, - ) +# def test_connector_deletion_for_overlapping_connectors( +# reset: None, vespa_client: vespa_fixture +# ) -> None: +# """Checks to make sure that connectors with overlapping documents work properly. Specifically, that the overlapping +# document (1) still exists and (2) has the right document set / group post-deletion of one of the connectors. +# """ +# # Creating an admin user (first user created is automatically an admin) +# admin_user: DATestUser = UserManager.create(name="admin_user") +# # create api key +# api_key: DATestAPIKey = APIKeyManager.create( +# user_performing_action=admin_user, +# ) + +# # create connectors +# cc_pair_1 = CCPairManager.create_from_scratch( +# source=DocumentSource.INGESTION_API, +# user_performing_action=admin_user, +# ) +# cc_pair_2 = CCPairManager.create_from_scratch( +# source=DocumentSource.INGESTION_API, +# user_performing_action=admin_user, +# ) + +# doc_ids = [str(uuid4())] +# cc_pair_1.documents = DocumentManager.seed_dummy_docs( +# cc_pair=cc_pair_1, +# document_ids=doc_ids, +# api_key=api_key, +# ) +# cc_pair_2.documents = DocumentManager.seed_dummy_docs( +# cc_pair=cc_pair_2, +# document_ids=doc_ids, +# api_key=api_key, +# ) + +# # verify vespa document exists and that it is not in any document sets or groups +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_1, +# doc_set_names=[], +# group_names=[], +# doc_creating_user=admin_user, +# ) +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_2, +# doc_set_names=[], +# group_names=[], +# doc_creating_user=admin_user, +# ) + +# # create document set +# doc_set_1 = DocumentSetManager.create( +# name="Test Document Set 1", +# cc_pair_ids=[cc_pair_1.id], +# user_performing_action=admin_user, +# ) +# DocumentSetManager.wait_for_sync( +# document_sets_to_check=[doc_set_1], +# user_performing_action=admin_user, +# ) + +# print("Document set 1 created and synced") + +# # verify vespa document is in the document set +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_1, +# doc_set_names=[doc_set_1.name], +# doc_creating_user=admin_user, +# ) +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_2, +# doc_creating_user=admin_user, +# ) + +# # create a user group and attach it to connector 1 +# user_group_1: DATestUserGroup = UserGroupManager.create( +# name="Test User Group 1", +# cc_pair_ids=[cc_pair_1.id], +# user_performing_action=admin_user, +# ) +# UserGroupManager.wait_for_sync( +# user_groups_to_check=[user_group_1], +# user_performing_action=admin_user, +# ) +# cc_pair_1.groups = [user_group_1.id] + +# print("User group 1 created and synced") + +# # create a user group and attach it to connector 2 +# user_group_2: DATestUserGroup = UserGroupManager.create( +# name="Test User Group 2", +# cc_pair_ids=[cc_pair_2.id], +# user_performing_action=admin_user, +# ) +# UserGroupManager.wait_for_sync( +# user_groups_to_check=[user_group_2], +# user_performing_action=admin_user, +# ) +# cc_pair_2.groups = [user_group_2.id] + +# print("User group 2 created and synced") + +# # verify vespa document is in the user group +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_1, +# group_names=[user_group_1.name, user_group_2.name], +# doc_creating_user=admin_user, +# ) +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_2, +# group_names=[user_group_1.name, user_group_2.name], +# doc_creating_user=admin_user, +# ) + +# # delete connector 1 +# CCPairManager.pause_cc_pair( +# cc_pair=cc_pair_1, +# user_performing_action=admin_user, +# ) +# CCPairManager.delete( +# cc_pair=cc_pair_1, +# user_performing_action=admin_user, +# ) + +# # wait for deletion to finish +# CCPairManager.wait_for_deletion_completion( +# cc_pair_id=cc_pair_1.id, user_performing_action=admin_user +# ) + +# print("Connector 1 deleted") + +# # check that only connector 1 is deleted +# # TODO: check for the CC pair rather than the connector once the refactor is done +# CCPairManager.verify( +# cc_pair=cc_pair_1, +# verify_deleted=True, +# user_performing_action=admin_user, +# ) +# CCPairManager.verify( +# cc_pair=cc_pair_2, +# user_performing_action=admin_user, +# ) + +# # verify the document is not in any document sets +# # verify the document is only in user group 2 +# DocumentManager.verify( +# vespa_client=vespa_client, +# cc_pair=cc_pair_2, +# doc_set_names=[], +# group_names=[user_group_2.name], +# doc_creating_user=admin_user, +# verify_deleted=False, +# ) From 231d03a4d6ff0d099404ea89732e94191eca9106 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 8 Jan 2025 12:25:19 -0800 Subject: [PATCH 08/11] fix --- backend/onyx/db/document.py | 15 +++-- .../document_index/document_index_utils.py | 7 ++- backend/onyx/document_index/vespa/index.py | 63 +++---------------- .../document_index/vespa/indexing_utils.py | 2 + .../common_utils/managers/document.py | 7 +++ 5 files changed, 28 insertions(+), 66 deletions(-) diff --git a/backend/onyx/db/document.py b/backend/onyx/db/document.py index b12c852f493..d9ff82d797a 100644 --- a/backend/onyx/db/document.py +++ b/backend/onyx/db/document.py @@ -685,23 +685,22 @@ def get_document_sources( def fetch_chunk_counts_for_documents( document_ids: list[str], db_session: Session, -) -> list[tuple[str, int | None]]: +) -> list[tuple[str, int]]: """ Return a list of (document_id, chunk_count) tuples. - Note: chunk_count might be None if not set in DB, - so we declare it as Optional[int]. + If a document_id is not found in the database, it will be returned with a chunk_count of 0. """ stmt = select(DbDocument.id, DbDocument.chunk_count).where( DbDocument.id.in_(document_ids) ) - # results is a list of 'Row' objects, each containing two columns results = db_session.execute(stmt).all() - # If DbDocument.id is guaranteed to be a string, you can just do row.id; - # otherwise cast to str if you need to be sure it's a string: - return [(str(row[0]), row[1]) for row in results] - # or row.id, row.chunk_count if they are named attributes in your ORM model + # Create a dictionary of document_id to chunk_count + chunk_counts = {str(row.id): row.chunk_count or 0 for row in results} + + # Return a list of tuples, using 0 for documents not found in the database + return [(doc_id, chunk_counts.get(doc_id, 0)) for doc_id in document_ids] def fetch_chunk_count_for_document( diff --git a/backend/onyx/document_index/document_index_utils.py b/backend/onyx/document_index/document_index_utils.py index 4770a8a2843..831ffb6d403 100644 --- a/backend/onyx/document_index/document_index_utils.py +++ b/backend/onyx/document_index/document_index_utils.py @@ -8,7 +8,7 @@ from onyx.db.search_settings import get_secondary_search_settings from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo from onyx.indexing.models import DocMetadataAwareIndexChunk - +from shared_configs.configs import MULTI_TENANT DEFAULT_BATCH_SIZE = 30 DEFAULT_INDEX_NAME = "danswer_chunk" @@ -113,10 +113,11 @@ def get_uuid_from_chunk_info( "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id) ) unique_identifier_string = "_".join([doc_str, chunk_index]) - if tenant_id: + if tenant_id and MULTI_TENANT: unique_identifier_string += "_" + tenant_id - return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string) + uuid_value = uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string) + return uuid_value def get_uuid_from_chunk_info_old( diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index 8d02ee0f04b..b69fa853d0c 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -337,12 +337,13 @@ def index( # know precisely which chunks to delete. This information exists for # documents that have `chunk_count` in the database, but not for # `old_version` documents. + enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = [ VespaIndex.enrich_basic_chunk_info( index_name=self.index_name, http_client=http_client, document_id=doc_id, - previous_chunk_count=doc_id_to_previous_chunk_cnt.get(doc_id), + previous_chunk_count=doc_id_to_previous_chunk_cnt.get(doc_id, 0), new_chunk_count=doc_id_to_new_chunk_cnt.get(doc_id, 0), ) for doc_id in doc_id_to_new_chunk_cnt.keys() @@ -518,10 +519,10 @@ def update( def update_single_chunk( self, - doc_chunk_id: UUID, # The chunk ID, presumably your docid in Vespa - index_name: str, # The Vespa doc type name (must match schema) - fields: VespaDocumentFields, # Your container of fields to update - doc_id: str, # Possibly used for logging + doc_chunk_id: UUID, + index_name: str, + fields: VespaDocumentFields, + doc_id: str, ) -> None: """ Update a single "chunk" (document) in Vespa using its chunk ID. @@ -559,27 +560,7 @@ def update_single_chunk( # Also note the "?create=true" so that partial updates upsert. vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true" - logger.info("Vespa partial-update payload: %s", update_dict) - logger.info('Attempting PUT to URL: "%s"', vespa_url) - with get_vespa_http_client(http2=False) as http_client: - # --- (Optional) Check the doc before updating --- - try: - before_resp = http_client.get(vespa_url) - before_resp.raise_for_status() - before_doc_json = before_resp.json() - # You might not want to log large fields like embeddings, etc. - # If so, pop them out before logging: - before_doc_json.pop("embeddings", None) - before_doc_json.pop("title_embedding", None) - logger.info("Document before update [%s]: %s", doc_id, before_doc_json) - except httpx.HTTPError: - logger.warning( - "Document %s did not exist prior to update (this might be normal if it's new).", - doc_chunk_id, - ) - - # --- Perform the partial update --- try: resp = http_client.put( vespa_url, @@ -588,29 +569,10 @@ def update_single_chunk( ) resp.raise_for_status() except httpx.HTTPStatusError as e: - logger.error( - "Failed to update doc chunk %s (doc_id=%s). Details: %s", - doc_chunk_id, - doc_id, - e.response.text, - ) + error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}" + logger.error(error_message) raise - # --- (Optional) Check the doc after updating --- - try: - after_resp = http_client.get(vespa_url) - after_resp.raise_for_status() - after_doc_json = after_resp.json() - after_doc_json.pop("embeddings", None) - after_doc_json.pop("title_embedding", None) - logger.info("Document after update [%s]: %s", doc_id, after_doc_json) - except httpx.HTTPError as e: - logger.warning( - "Failed fetching document after update for %s: %s", - doc_chunk_id, - str(e), - ) - def update_single( self, doc_id: str, @@ -623,8 +585,6 @@ def update_single( function will complete with no errors or exceptions. Handle other exceptions if you wish to implement retry behavior """ - logger.info(f"RIGHT NOW UPDATING document {doc_id} with fields {fields}") - logger.info(fields.__dict__) doc_chunk_count = 0 @@ -640,7 +600,6 @@ def update_single( primary_index=index_name == self.index_name, ) large_chunks_enabled = multipass_config.enable_large_chunks - enriched_doc_infos = VespaIndex.enrich_basic_chunk_info( index_name=index_name, http_client=http_client, @@ -648,8 +607,6 @@ def update_single( previous_chunk_count=chunk_count, new_chunk_count=0, ) - logger.info("ENRICHED DOC INFO") - logger.info(enriched_doc_infos) doc_chunk_ids = get_document_chunk_ids( enriched_document_info_list=[enriched_doc_infos], @@ -711,9 +668,6 @@ def delete_single( previous_chunk_count=chunk_count, new_chunk_count=0, ) - print("the enriched doc info", enriched_doc_infos) - # for doc_info in enriched_doc_infos: - # print(doc_info.__dict__) chunks_to_delete = get_document_chunk_ids( enriched_document_info_list=[enriched_doc_infos], tenant_id=tenant_id, @@ -722,7 +676,6 @@ def delete_single( for doc_chunk_ids_batch in batch_generator( chunks_to_delete, BATCH_SIZE ): - print("DELETING CHUNK") total_chunks_deleted += len(doc_chunk_ids_batch) delete_vespa_chunks( doc_chunk_ids=doc_chunk_ids_batch, diff --git a/backend/onyx/document_index/vespa/indexing_utils.py b/backend/onyx/document_index/vespa/indexing_utils.py index 857162382ef..ed802ada9ac 100644 --- a/backend/onyx/document_index/vespa/indexing_utils.py +++ b/backend/onyx/document_index/vespa/indexing_utils.py @@ -136,7 +136,9 @@ def _index_vespa_chunk( document = chunk.source_document # No minichunk documents in vespa, minichunk vectors are stored in the chunk itself + vespa_chunk_id = str(get_uuid_from_chunk(chunk)) + embeddings = chunk.embeddings embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding} diff --git a/backend/tests/integration/common_utils/managers/document.py b/backend/tests/integration/common_utils/managers/document.py index 58465be0149..1e6d5b6acd5 100644 --- a/backend/tests/integration/common_utils/managers/document.py +++ b/backend/tests/integration/common_utils/managers/document.py @@ -158,9 +158,16 @@ def verify( ) -> None: doc_ids = [document.id for document in cc_pair.documents] retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"] + print(f"from the doc ids {doc_ids}") + + print("--------------------------------") + for doc in retrieved_docs_dict: + print(f"document iD: {doc['fields']['document_id']} and {doc['id']}") + retrieved_docs = { doc["fields"]["document_id"]: doc["fields"] for doc in retrieved_docs_dict } + # Left this here for debugging purposes. # import json # for doc in retrieved_docs.values(): From 8ef4857e8a081889d501e8e75553a3263a62a722 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 8 Jan 2025 12:31:16 -0800 Subject: [PATCH 09/11] minor clean up --- .../background/celery/tasks/vespa/tasks.py | 4 - backend/scripts/orphan_doc_cleanup_script.py | 1 - backend/testing.txt | 140 ++++++++ .../common_utils/managers/document.py | 6 - .../connector/test_connector_deletion.py | 318 +++++++++--------- backend/vespa_update_logs.txt | 120 +++++++ 6 files changed, 420 insertions(+), 169 deletions(-) create mode 100644 backend/testing.txt create mode 100644 backend/vespa_update_logs.txt diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py index f3c7b67818e..8eabeb7d8a8 100644 --- a/backend/onyx/background/celery/tasks/vespa/tasks.py +++ b/backend/onyx/background/celery/tasks/vespa/tasks.py @@ -990,10 +990,6 @@ def vespa_metadata_sync_task( boost=doc.boost, hidden=doc.hidden, ) - logger.info("\n\n\n\n\n\n\n\nuPDATING DOCUMENT") - logger.info(document_id) - logger.info(fields) - logger.info(doc_access) # update Vespa. OK if doc doesn't exist. Raises exception otherwise. chunks_affected = retry_index.update_single( diff --git a/backend/scripts/orphan_doc_cleanup_script.py b/backend/scripts/orphan_doc_cleanup_script.py index baf52d93b05..c138bdc64a6 100644 --- a/backend/scripts/orphan_doc_cleanup_script.py +++ b/backend/scripts/orphan_doc_cleanup_script.py @@ -66,7 +66,6 @@ def main() -> None: def process_doc(doc_id: str) -> str | None: document = get_document(doc_id, db_session) if not document: - print(f"Document {doc_id} not found in Postgres") return None # Check if document exists in Vespa first try: diff --git a/backend/testing.txt b/backend/testing.txt new file mode 100644 index 00000000000..49406e070e9 --- /dev/null +++ b/backend/testing.txt @@ -0,0 +1,140 @@ +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] doc_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] doc_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] doc_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] doc_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] doc_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] doc_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] doc_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 +[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab +[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa +[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab +[2025-01-08 12:25:51] doc_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa +[2025-01-08 12:25:51] doc_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] doc_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 +[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc +[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 +[2025-01-08 12:25:51] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e +[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc +[2025-01-08 12:25:51] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf +[2025-01-08 12:25:51] doc_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 +[2025-01-08 12:25:51] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] doc_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf +[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 +[2025-01-08 12:25:51] doc_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] doc_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] doc_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 +[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc +[2025-01-08 12:25:51] OLD? UPDATE SINGLE +[2025-01-08 12:25:51] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab +[2025-01-08 12:25:51] doc_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 +[2025-01-08 12:25:51] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 +[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 +[2025-01-08 12:25:51] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab +[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc +[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 +[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 +[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 +[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] called in update_single +[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa +[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa +[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] OLD? UPDATE SINGLE +[2025-01-08 12:25:52] doc_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:52] OLD? UPDATE SINGLE +[2025-01-08 12:25:52] OLD? UPDATE SINGLE +[2025-01-08 12:25:52] doc_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:52] doc_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:52] OLD? UPDATE SINGLE +[2025-01-08 12:25:52] doc_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) +[2025-01-08 12:25:52] called in update_single +[2025-01-08 12:25:52] called in update_single +[2025-01-08 12:25:52] called in update_single +[2025-01-08 12:25:52] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf +[2025-01-08 12:25:52] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e +[2025-01-08 12:25:52] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 +[2025-01-08 12:25:52] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf +[2025-01-08 12:25:52] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e +[2025-01-08 12:25:52] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 +[2025-01-08 12:25:52] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] called in update_single +[2025-01-08 12:25:52] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 +[2025-01-08 12:25:52] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None +[2025-01-08 12:25:52] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 +[2025-01-08 12:25:52] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None diff --git a/backend/tests/integration/common_utils/managers/document.py b/backend/tests/integration/common_utils/managers/document.py index 1e6d5b6acd5..d1514126cc3 100644 --- a/backend/tests/integration/common_utils/managers/document.py +++ b/backend/tests/integration/common_utils/managers/document.py @@ -23,7 +23,6 @@ def _verify_document_permissions( ) -> None: acl_keys = set(retrieved_doc.get("access_control_list", {}).keys()) print(f"ACL keys: {acl_keys}") - print(f"Full retrieved document: {retrieved_doc}") if cc_pair.access_type == AccessType.PUBLIC: if "PUBLIC" not in acl_keys: @@ -158,11 +157,6 @@ def verify( ) -> None: doc_ids = [document.id for document in cc_pair.documents] retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"] - print(f"from the doc ids {doc_ids}") - - print("--------------------------------") - for doc in retrieved_docs_dict: - print(f"document iD: {doc['fields']['document_id']} and {doc['id']}") retrieved_docs = { doc["fields"]["document_id"]: doc["fields"] for doc in retrieved_docs_dict diff --git a/backend/tests/integration/tests/connector/test_connector_deletion.py b/backend/tests/integration/tests/connector/test_connector_deletion.py index d2af7a31f56..8878a502e2a 100644 --- a/backend/tests/integration/tests/connector/test_connector_deletion.py +++ b/backend/tests/integration/tests/connector/test_connector_deletion.py @@ -5,6 +5,8 @@ - updates the document sets and user groups to remove the connector - Ensure that deleting a connector that is part of an overlapping document set and/or user group works as expected """ +from uuid import uuid4 + from sqlalchemy.orm import Session from onyx.db.engine import get_sqlalchemy_engine @@ -190,161 +192,161 @@ def test_connector_deletion(reset: None, vespa_client: vespa_fixture) -> None: ) -# def test_connector_deletion_for_overlapping_connectors( -# reset: None, vespa_client: vespa_fixture -# ) -> None: -# """Checks to make sure that connectors with overlapping documents work properly. Specifically, that the overlapping -# document (1) still exists and (2) has the right document set / group post-deletion of one of the connectors. -# """ -# # Creating an admin user (first user created is automatically an admin) -# admin_user: DATestUser = UserManager.create(name="admin_user") -# # create api key -# api_key: DATestAPIKey = APIKeyManager.create( -# user_performing_action=admin_user, -# ) - -# # create connectors -# cc_pair_1 = CCPairManager.create_from_scratch( -# source=DocumentSource.INGESTION_API, -# user_performing_action=admin_user, -# ) -# cc_pair_2 = CCPairManager.create_from_scratch( -# source=DocumentSource.INGESTION_API, -# user_performing_action=admin_user, -# ) - -# doc_ids = [str(uuid4())] -# cc_pair_1.documents = DocumentManager.seed_dummy_docs( -# cc_pair=cc_pair_1, -# document_ids=doc_ids, -# api_key=api_key, -# ) -# cc_pair_2.documents = DocumentManager.seed_dummy_docs( -# cc_pair=cc_pair_2, -# document_ids=doc_ids, -# api_key=api_key, -# ) - -# # verify vespa document exists and that it is not in any document sets or groups -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_1, -# doc_set_names=[], -# group_names=[], -# doc_creating_user=admin_user, -# ) -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_2, -# doc_set_names=[], -# group_names=[], -# doc_creating_user=admin_user, -# ) - -# # create document set -# doc_set_1 = DocumentSetManager.create( -# name="Test Document Set 1", -# cc_pair_ids=[cc_pair_1.id], -# user_performing_action=admin_user, -# ) -# DocumentSetManager.wait_for_sync( -# document_sets_to_check=[doc_set_1], -# user_performing_action=admin_user, -# ) - -# print("Document set 1 created and synced") - -# # verify vespa document is in the document set -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_1, -# doc_set_names=[doc_set_1.name], -# doc_creating_user=admin_user, -# ) -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_2, -# doc_creating_user=admin_user, -# ) - -# # create a user group and attach it to connector 1 -# user_group_1: DATestUserGroup = UserGroupManager.create( -# name="Test User Group 1", -# cc_pair_ids=[cc_pair_1.id], -# user_performing_action=admin_user, -# ) -# UserGroupManager.wait_for_sync( -# user_groups_to_check=[user_group_1], -# user_performing_action=admin_user, -# ) -# cc_pair_1.groups = [user_group_1.id] - -# print("User group 1 created and synced") - -# # create a user group and attach it to connector 2 -# user_group_2: DATestUserGroup = UserGroupManager.create( -# name="Test User Group 2", -# cc_pair_ids=[cc_pair_2.id], -# user_performing_action=admin_user, -# ) -# UserGroupManager.wait_for_sync( -# user_groups_to_check=[user_group_2], -# user_performing_action=admin_user, -# ) -# cc_pair_2.groups = [user_group_2.id] - -# print("User group 2 created and synced") - -# # verify vespa document is in the user group -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_1, -# group_names=[user_group_1.name, user_group_2.name], -# doc_creating_user=admin_user, -# ) -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_2, -# group_names=[user_group_1.name, user_group_2.name], -# doc_creating_user=admin_user, -# ) - -# # delete connector 1 -# CCPairManager.pause_cc_pair( -# cc_pair=cc_pair_1, -# user_performing_action=admin_user, -# ) -# CCPairManager.delete( -# cc_pair=cc_pair_1, -# user_performing_action=admin_user, -# ) - -# # wait for deletion to finish -# CCPairManager.wait_for_deletion_completion( -# cc_pair_id=cc_pair_1.id, user_performing_action=admin_user -# ) - -# print("Connector 1 deleted") - -# # check that only connector 1 is deleted -# # TODO: check for the CC pair rather than the connector once the refactor is done -# CCPairManager.verify( -# cc_pair=cc_pair_1, -# verify_deleted=True, -# user_performing_action=admin_user, -# ) -# CCPairManager.verify( -# cc_pair=cc_pair_2, -# user_performing_action=admin_user, -# ) - -# # verify the document is not in any document sets -# # verify the document is only in user group 2 -# DocumentManager.verify( -# vespa_client=vespa_client, -# cc_pair=cc_pair_2, -# doc_set_names=[], -# group_names=[user_group_2.name], -# doc_creating_user=admin_user, -# verify_deleted=False, -# ) +def test_connector_deletion_for_overlapping_connectors( + reset: None, vespa_client: vespa_fixture +) -> None: + """Checks to make sure that connectors with overlapping documents work properly. Specifically, that the overlapping + document (1) still exists and (2) has the right document set / group post-deletion of one of the connectors. + """ + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + # create api key + api_key: DATestAPIKey = APIKeyManager.create( + user_performing_action=admin_user, + ) + + # create connectors + cc_pair_1 = CCPairManager.create_from_scratch( + source=DocumentSource.INGESTION_API, + user_performing_action=admin_user, + ) + cc_pair_2 = CCPairManager.create_from_scratch( + source=DocumentSource.INGESTION_API, + user_performing_action=admin_user, + ) + + doc_ids = [str(uuid4())] + cc_pair_1.documents = DocumentManager.seed_dummy_docs( + cc_pair=cc_pair_1, + document_ids=doc_ids, + api_key=api_key, + ) + cc_pair_2.documents = DocumentManager.seed_dummy_docs( + cc_pair=cc_pair_2, + document_ids=doc_ids, + api_key=api_key, + ) + + # verify vespa document exists and that it is not in any document sets or groups + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_1, + doc_set_names=[], + group_names=[], + doc_creating_user=admin_user, + ) + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_2, + doc_set_names=[], + group_names=[], + doc_creating_user=admin_user, + ) + + # create document set + doc_set_1 = DocumentSetManager.create( + name="Test Document Set 1", + cc_pair_ids=[cc_pair_1.id], + user_performing_action=admin_user, + ) + DocumentSetManager.wait_for_sync( + document_sets_to_check=[doc_set_1], + user_performing_action=admin_user, + ) + + print("Document set 1 created and synced") + + # verify vespa document is in the document set + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_1, + doc_set_names=[doc_set_1.name], + doc_creating_user=admin_user, + ) + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_2, + doc_creating_user=admin_user, + ) + + # create a user group and attach it to connector 1 + user_group_1: DATestUserGroup = UserGroupManager.create( + name="Test User Group 1", + cc_pair_ids=[cc_pair_1.id], + user_performing_action=admin_user, + ) + UserGroupManager.wait_for_sync( + user_groups_to_check=[user_group_1], + user_performing_action=admin_user, + ) + cc_pair_1.groups = [user_group_1.id] + + print("User group 1 created and synced") + + # create a user group and attach it to connector 2 + user_group_2: DATestUserGroup = UserGroupManager.create( + name="Test User Group 2", + cc_pair_ids=[cc_pair_2.id], + user_performing_action=admin_user, + ) + UserGroupManager.wait_for_sync( + user_groups_to_check=[user_group_2], + user_performing_action=admin_user, + ) + cc_pair_2.groups = [user_group_2.id] + + print("User group 2 created and synced") + + # verify vespa document is in the user group + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_1, + group_names=[user_group_1.name, user_group_2.name], + doc_creating_user=admin_user, + ) + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_2, + group_names=[user_group_1.name, user_group_2.name], + doc_creating_user=admin_user, + ) + + # delete connector 1 + CCPairManager.pause_cc_pair( + cc_pair=cc_pair_1, + user_performing_action=admin_user, + ) + CCPairManager.delete( + cc_pair=cc_pair_1, + user_performing_action=admin_user, + ) + + # wait for deletion to finish + CCPairManager.wait_for_deletion_completion( + cc_pair_id=cc_pair_1.id, user_performing_action=admin_user + ) + + print("Connector 1 deleted") + + # check that only connector 1 is deleted + # TODO: check for the CC pair rather than the connector once the refactor is done + CCPairManager.verify( + cc_pair=cc_pair_1, + verify_deleted=True, + user_performing_action=admin_user, + ) + CCPairManager.verify( + cc_pair=cc_pair_2, + user_performing_action=admin_user, + ) + + # verify the document is not in any document sets + # verify the document is only in user group 2 + DocumentManager.verify( + vespa_client=vespa_client, + cc_pair=cc_pair_2, + doc_set_names=[], + group_names=[user_group_2.name], + doc_creating_user=admin_user, + verify_deleted=False, + ) diff --git a/backend/vespa_update_logs.txt b/backend/vespa_update_logs.txt new file mode 100644 index 00000000000..b4afa57c6e7 --- /dev/null +++ b/backend/vespa_update_logs.txt @@ -0,0 +1,120 @@ +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-9666258c-a25a-4290-9050-951dd054f0b3', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac Chunk ID: c4e99738-6467-5a14-a151-d2d68173d286 +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 Chunk ID: b518bd97-dda4-5241-8dcf-60164e0a6baa +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-49b93c6c-0625-474e-a29b-10533384c461', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 Chunk ID: ea672bb3-2d07-51c5-a39a-5f5bd7219fab +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa?create=true" +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286?create=true" +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-3340635d-6077-4feb-8553-0618c0a249c7', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 Chunk ID: d88662bf-f859-58cc-84b6-f04257838115 +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-c0cda300-a641-449e-b53f-8572735899dc', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab?create=true" +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Document c4e99738-6467-5a14-a151-d2d68173d286 did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Document b518bd97-dda4-5241-8dcf-60164e0a6baa did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 Chunk ID: 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115?create=true" +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 Chunk ID: b15838ad-4607-5815-9079-6c37a6688836 +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 Chunk ID: 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc +[2025-01-08 12:25:51] Document ea672bb3-2d07-51c5-a39a-5f5bd7219fab did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc Chunk ID: 91971d6a-e954-5c17-a3f4-88b9e7090adf +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836?create=true" +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf?create=true" +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e?create=true" +[2025-01-08 12:25:51] Document d88662bf-f859-58cc-84b6-f04257838115 did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 Chunk ID: 1175290a-e73b-59ae-a00b-155603488490 +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc?create=true" +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-3340635d-6077-4feb-8553-0618c0a249c7', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490?create=true" +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-49b93c6c-0625-474e-a29b-10533384c461', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Document 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Document b15838ad-4607-5815-9079-6c37a6688836 did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 Chunk ID: 9188e172-07ff-547f-8ecc-f8ce25c181b0 +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac Chunk ID: c4e99738-6467-5a14-a151-d2d68173d286 +[2025-01-08 12:25:51] Document 91971d6a-e954-5c17-a3f4-88b9e7090adf did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 Chunk ID: 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 Chunk ID: ea672bb3-2d07-51c5-a39a-5f5bd7219fab +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc?create=true" +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0?create=true" +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286?create=true" +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab?create=true" +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 Chunk ID: d88662bf-f859-58cc-84b6-f04257838115 +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 Chunk ID: 1175290a-e73b-59ae-a00b-155603488490 +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] Document 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490?create=true" +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115?create=true" +[2025-01-08 12:25:51] Document 1175290a-e73b-59ae-a00b-155603488490 did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-9666258c-a25a-4290-9050-951dd054f0b3', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:51] Document 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Document 9188e172-07ff-547f-8ecc-f8ce25c181b0 did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] CREATED Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 Chunk ID: b518bd97-dda4-5241-8dcf-60164e0a6baa +[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa?create=true" +[2025-01-08 12:25:51] Document 1175290a-e73b-59ae-a00b-155603488490 did not exist prior to update (this might be normal if it's new). +[2025-01-08 12:25:51] Document before update [test-doc-9666258c-a25a-4290-9050-951dd054f0b3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b518bd97-dda4-5241-8dcf-60164e0a6baa', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document before update [test-doc-49b93c6c-0625-474e-a29b-10533384c461]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::d88662bf-f859-58cc-84b6-f04257838115', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document before update [test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document before update [test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::c4e99738-6467-5a14-a151-d2d68173d286', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-c0cda300-a641-449e-b53f-8572735899dc]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::91971d6a-e954-5c17-a3f4-88b9e7090adf', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-49b93c6c-0625-474e-a29b-10533384c461]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::d88662bf-f859-58cc-84b6-f04257838115', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::9188e172-07ff-547f-8ecc-f8ce25c181b0', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-9666258c-a25a-4290-9050-951dd054f0b3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b518bd97-dda4-5241-8dcf-60164e0a6baa', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b15838ad-4607-5815-9079-6c37a6688836', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::c4e99738-6467-5a14-a151-d2d68173d286', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::1175290a-e73b-59ae-a00b-155603488490', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:51] Document after update [test-doc-3340635d-6077-4feb-8553-0618c0a249c7]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::1175290a-e73b-59ae-a00b-155603488490', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-3340635d-6077-4feb-8553-0618c0a249c7]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::c4e99738-6467-5a14-a151-d2d68173d286', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-49b93c6c-0625-474e-a29b-10533384c461]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::d88662bf-f859-58cc-84b6-f04257838115', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-9666258c-a25a-4290-9050-951dd054f0b3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b518bd97-dda4-5241-8dcf-60164e0a6baa', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-c0cda300-a641-449e-b53f-8572735899dc', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:52] CREATED Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 Chunk ID: 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e +[2025-01-08 12:25:52] CREATED Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc Chunk ID: 91971d6a-e954-5c17-a3f4-88b9e7090adf +[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:52] CREATED Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 Chunk ID: b15838ad-4607-5815-9079-6c37a6688836 +[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf?create=true" +[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e?create=true" +[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836?create=true" +[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43', chunk_start_index=0, old_version=False, chunk_end_index=1) +[2025-01-08 12:25:52] CREATED Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 Chunk ID: 9188e172-07ff-547f-8ecc-f8ce25c181b0 +[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} +[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0?create=true" +[2025-01-08 12:25:52] Document before update [test-doc-c0cda300-a641-449e-b53f-8572735899dc]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::91971d6a-e954-5c17-a3f4-88b9e7090adf', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document before update [test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document before update [test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b15838ad-4607-5815-9079-6c37a6688836', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document before update [test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::9188e172-07ff-547f-8ecc-f8ce25c181b0', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b15838ad-4607-5815-9079-6c37a6688836', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::9188e172-07ff-547f-8ecc-f8ce25c181b0', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} +[2025-01-08 12:25:52] Document after update [test-doc-c0cda300-a641-449e-b53f-8572735899dc]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::91971d6a-e954-5c17-a3f4-88b9e7090adf', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} From 86d9829c6fe9770c3c58e4d05264e3eef18acb02 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 8 Jan 2025 12:31:44 -0800 Subject: [PATCH 10/11] remove logs --- backend/testing.txt | 140 ---------------------------------- backend/vespa_update_logs.txt | 120 ----------------------------- 2 files changed, 260 deletions(-) delete mode 100644 backend/testing.txt delete mode 100644 backend/vespa_update_logs.txt diff --git a/backend/testing.txt b/backend/testing.txt deleted file mode 100644 index 49406e070e9..00000000000 --- a/backend/testing.txt +++ /dev/null @@ -1,140 +0,0 @@ -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] doc_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] doc_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] doc_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] doc_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] doc_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] doc_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] doc_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 -[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab -[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa -[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab -[2025-01-08 12:25:51] doc_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa -[2025-01-08 12:25:51] doc_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] doc_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 -[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc -[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 -[2025-01-08 12:25:51] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e -[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc -[2025-01-08 12:25:51] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf -[2025-01-08 12:25:51] doc_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 -[2025-01-08 12:25:51] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] doc_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf -[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 -[2025-01-08 12:25:51] doc_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] doc_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] doc_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 -[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc -[2025-01-08 12:25:51] OLD? UPDATE SINGLE -[2025-01-08 12:25:51] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab -[2025-01-08 12:25:51] doc_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:51] Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac and id is c4e99738-6467-5a14-a151-d2d68173d286 -[2025-01-08 12:25:51] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 -[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 -[2025-01-08 12:25:51] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 and id is ea672bb3-2d07-51c5-a39a-5f5bd7219fab -[2025-01-08 12:25:51] Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 and id is 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc -[2025-01-08 12:25:51] document_id test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-3340635d-6077-4feb-8553-0618c0a249c7, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 -[2025-01-08 12:25:51] document_id test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 and id is d88662bf-f859-58cc-84b6-f04257838115 -[2025-01-08 12:25:51] Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 and id is 1175290a-e73b-59ae-a00b-155603488490 -[2025-01-08 12:25:51] document_id test-doc-49b93c6c-0625-474e-a29b-10533384c461, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] document_id test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] called in update_single -[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa -[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:51] Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 and id is b518bd97-dda4-5241-8dcf-60164e0a6baa -[2025-01-08 12:25:51] document_id test-doc-9666258c-a25a-4290-9050-951dd054f0b3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] OLD? UPDATE SINGLE -[2025-01-08 12:25:52] doc_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:52] OLD? UPDATE SINGLE -[2025-01-08 12:25:52] OLD? UPDATE SINGLE -[2025-01-08 12:25:52] doc_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:52] doc_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:52] OLD? UPDATE SINGLE -[2025-01-08 12:25:52] doc_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_count 1, tenant_id None, fields VespaDocumentFields(access=DocumentAccess(external_user_emails=set(), external_user_group_ids=set(), is_public=True, user_emails={'admin_user@test.com'}, user_groups={'test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52'}), document_sets=set(), boost=0, hidden=False) -[2025-01-08 12:25:52] called in update_single -[2025-01-08 12:25:52] called in update_single -[2025-01-08 12:25:52] called in update_single -[2025-01-08 12:25:52] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf -[2025-01-08 12:25:52] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e -[2025-01-08 12:25:52] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 -[2025-01-08 12:25:52] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc and id is 91971d6a-e954-5c17-a3f4-88b9e7090adf -[2025-01-08 12:25:52] Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 and id is 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e -[2025-01-08 12:25:52] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] document_id test-doc-c0cda300-a641-449e-b53f-8572735899dc, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] document_id test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 and id is b15838ad-4607-5815-9079-6c37a6688836 -[2025-01-08 12:25:52] document_id test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] called in update_single -[2025-01-08 12:25:52] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 -[2025-01-08 12:25:52] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None -[2025-01-08 12:25:52] Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 and id is 9188e172-07ff-547f-8ecc-f8ce25c181b0 -[2025-01-08 12:25:52] document_id test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43, chunk_id 0, tenant_id public, large_chunk_id None diff --git a/backend/vespa_update_logs.txt b/backend/vespa_update_logs.txt deleted file mode 100644 index b4afa57c6e7..00000000000 --- a/backend/vespa_update_logs.txt +++ /dev/null @@ -1,120 +0,0 @@ -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-9666258c-a25a-4290-9050-951dd054f0b3', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac Chunk ID: c4e99738-6467-5a14-a151-d2d68173d286 -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 Chunk ID: b518bd97-dda4-5241-8dcf-60164e0a6baa -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-49b93c6c-0625-474e-a29b-10533384c461', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 Chunk ID: ea672bb3-2d07-51c5-a39a-5f5bd7219fab -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa?create=true" -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286?create=true" -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-3340635d-6077-4feb-8553-0618c0a249c7', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 Chunk ID: d88662bf-f859-58cc-84b6-f04257838115 -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-c0cda300-a641-449e-b53f-8572735899dc', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab?create=true" -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Document c4e99738-6467-5a14-a151-d2d68173d286 did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Document b518bd97-dda4-5241-8dcf-60164e0a6baa did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 Chunk ID: 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115?create=true" -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 Chunk ID: b15838ad-4607-5815-9079-6c37a6688836 -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 Chunk ID: 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc -[2025-01-08 12:25:51] Document ea672bb3-2d07-51c5-a39a-5f5bd7219fab did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc Chunk ID: 91971d6a-e954-5c17-a3f4-88b9e7090adf -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836?create=true" -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf?create=true" -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e?create=true" -[2025-01-08 12:25:51] Document d88662bf-f859-58cc-84b6-f04257838115 did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 Chunk ID: 1175290a-e73b-59ae-a00b-155603488490 -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc?create=true" -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-3340635d-6077-4feb-8553-0618c0a249c7', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490?create=true" -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-49b93c6c-0625-474e-a29b-10533384c461', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Document 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Document b15838ad-4607-5815-9079-6c37a6688836 did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 Chunk ID: 9188e172-07ff-547f-8ecc-f8ce25c181b0 -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac Chunk ID: c4e99738-6467-5a14-a151-d2d68173d286 -[2025-01-08 12:25:51] Document 91971d6a-e954-5c17-a3f4-88b9e7090adf did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-3340635d-6077-4feb-8553-0618c0a249c7 Chunk ID: 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9 Chunk ID: ea672bb3-2d07-51c5-a39a-5f5bd7219fab -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc?create=true" -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0?create=true" -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286?create=true" -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab?create=true" -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-49b93c6c-0625-474e-a29b-10533384c461 Chunk ID: d88662bf-f859-58cc-84b6-f04257838115 -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547 Chunk ID: 1175290a-e73b-59ae-a00b-155603488490 -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] Document 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490?create=true" -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115?create=true" -[2025-01-08 12:25:51] Document 1175290a-e73b-59ae-a00b-155603488490 did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] EnrichedDocumentIndexingInfo(doc_id='test-doc-9666258c-a25a-4290-9050-951dd054f0b3', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:51] Document 3d01e4cc-0c5f-57ad-8139-fbc7edce37cc did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Document 9188e172-07ff-547f-8ecc-f8ce25c181b0 did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] CREATED Doc ID: test-doc-9666258c-a25a-4290-9050-951dd054f0b3 Chunk ID: b518bd97-dda4-5241-8dcf-60164e0a6baa -[2025-01-08 12:25:51] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:51] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa?create=true" -[2025-01-08 12:25:51] Document 1175290a-e73b-59ae-a00b-155603488490 did not exist prior to update (this might be normal if it's new). -[2025-01-08 12:25:51] Document before update [test-doc-9666258c-a25a-4290-9050-951dd054f0b3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b518bd97-dda4-5241-8dcf-60164e0a6baa', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document before update [test-doc-49b93c6c-0625-474e-a29b-10533384c461]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::d88662bf-f859-58cc-84b6-f04257838115', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document before update [test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document before update [test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::c4e99738-6467-5a14-a151-d2d68173d286', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-c0cda300-a641-449e-b53f-8572735899dc]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::91971d6a-e954-5c17-a3f4-88b9e7090adf', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-49b93c6c-0625-474e-a29b-10533384c461]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::d88662bf-f859-58cc-84b6-f04257838115', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::9188e172-07ff-547f-8ecc-f8ce25c181b0', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-9666258c-a25a-4290-9050-951dd054f0b3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b518bd97-dda4-5241-8dcf-60164e0a6baa', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b15838ad-4607-5815-9079-6c37a6688836', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::c4e99738-6467-5a14-a151-d2d68173d286', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::1175290a-e73b-59ae-a00b-155603488490', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:51] Document after update [test-doc-3340635d-6077-4feb-8553-0618c0a249c7]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-48b1c08c-2625-4a79-b5a8-57f96648f547]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/1175290a-e73b-59ae-a00b-155603488490', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::1175290a-e73b-59ae-a00b-155603488490', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-3340635d-6077-4feb-8553-0618c0a249c7]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::3d01e4cc-0c5f-57ad-8139-fbc7edce37cc', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-010f8ddf-f065-421e-8b5f-dba2e51409ac]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/c4e99738-6467-5a14-a151-d2d68173d286', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::c4e99738-6467-5a14-a151-d2d68173d286', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-15d0dcd4-afd0-41c0-ac3a-f062d7d54ae9]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::ea672bb3-2d07-51c5-a39a-5f5bd7219fab', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-49b93c6c-0625-474e-a29b-10533384c461]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/d88662bf-f859-58cc-84b6-f04257838115', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::d88662bf-f859-58cc-84b6-f04257838115', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-9666258c-a25a-4290-9050-951dd054f0b3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b518bd97-dda4-5241-8dcf-60164e0a6baa', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b518bd97-dda4-5241-8dcf-60164e0a6baa', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-c0cda300-a641-449e-b53f-8572735899dc', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:52] CREATED Doc ID: test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248 Chunk ID: 619fc1a8-74e3-57fe-95dc-1ac48d9ead3e -[2025-01-08 12:25:52] CREATED Doc ID: test-doc-c0cda300-a641-449e-b53f-8572735899dc Chunk ID: 91971d6a-e954-5c17-a3f4-88b9e7090adf -[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:52] CREATED Doc ID: test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3 Chunk ID: b15838ad-4607-5815-9079-6c37a6688836 -[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf?create=true" -[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e?create=true" -[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836?create=true" -[2025-01-08 12:25:52] EnrichedDocumentIndexingInfo(doc_id='test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43', chunk_start_index=0, old_version=False, chunk_end_index=1) -[2025-01-08 12:25:52] CREATED Doc ID: test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43 Chunk ID: 9188e172-07ff-547f-8ecc-f8ce25c181b0 -[2025-01-08 12:25:52] Vespa partial-update payload: {'fields': {'boost': {'assign': 0}, 'document_sets': {'assign': {}}, 'access_control_list': {'assign': {'PUBLIC': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1, 'user_email:admin_user@test.com': 1}}, 'hidden': {'assign': False}}} -[2025-01-08 12:25:52] Attempting PUT to URL: "http://localhost:8081/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0?create=true" -[2025-01-08 12:25:52] Document before update [test-doc-c0cda300-a641-449e-b53f-8572735899dc]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::91971d6a-e954-5c17-a3f4-88b9e7090adf', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document before update [test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document before update [test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b15838ad-4607-5815-9079-6c37a6688836', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document before update [test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::9188e172-07ff-547f-8ecc-f8ce25c181b0', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-99aacccf-d43f-46f8-8fd3-979a4a58f248]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::619fc1a8-74e3-57fe-95dc-1ac48d9ead3e', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-aea1f8b8-f1a7-4071-a9c2-492a372512a3]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/b15838ad-4607-5815-9079-6c37a6688836', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::b15838ad-4607-5815-9079-6c37a6688836', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-e91efe25-09d0-4f1f-8adb-f62ae8bd1f43]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/9188e172-07ff-547f-8ecc-f8ce25c181b0', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::9188e172-07ff-547f-8ecc-f8ce25c181b0', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} -[2025-01-08 12:25:52] Document after update [test-doc-c0cda300-a641-449e-b53f-8572735899dc]: {'pathId': '/document/v1/default/danswer_chunk_nomic_ai_nomic_embed_text_v1/docid/91971d6a-e954-5c17-a3f4-88b9e7090adf', 'id': 'id:default:danswer_chunk_nomic_ai_nomic_embed_text_v1::91971d6a-e954-5c17-a3f4-88b9e7090adf', 'fields': {'hidden': False, 'boost': 0.0, 'skip_title': False, 'access_control_list': {'PUBLIC': 1, 'user_email:admin_user@test.com': 1, 'group:test-user-group-c0453b8b-0090-4d0f-8672-e08091908c52': 1}, 'section_continuation': False}} From 85fa3278771e43fdaf9b39aa1fa4783286ac7147 Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Wed, 8 Jan 2025 12:35:49 -0800 Subject: [PATCH 11/11] quick nit --- backend/onyx/document_index/vespa/index.py | 9 +-------- .../tests/integration/common_utils/managers/document.py | 2 -- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/backend/onyx/document_index/vespa/index.py b/backend/onyx/document_index/vespa/index.py index b69fa853d0c..ce2c580524f 100644 --- a/backend/onyx/document_index/vespa/index.py +++ b/backend/onyx/document_index/vespa/index.py @@ -526,10 +526,8 @@ def update_single_chunk( ) -> None: """ Update a single "chunk" (document) in Vespa using its chunk ID. - Requires that 'index_name' matches the document type in your Vespa schema. """ - # 1) Build the partial-update JSON update_dict: dict[str, dict] = {"fields": {}} if fields.boost is not None: @@ -542,7 +540,7 @@ def update_single_chunk( } if fields.access is not None: - # Another WeightedSet + # Similar to above update_dict["fields"][ACCESS_CONTROL_LIST] = { "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()} } @@ -554,10 +552,6 @@ def update_single_chunk( logger.error("Update request received but nothing to update.") return - # 2) Construct the correct doc URL - # Make sure DOCUMENT_ID_ENDPOINT includes something like: - # http://:/document/v1/{namespace}/{doc_type}/docid - # Also note the "?create=true" so that partial updates upsert. vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true" with get_vespa_http_client(http2=False) as http_client: @@ -637,7 +631,6 @@ def delete_single( tenant_id: str | None, chunk_count: int | None, ) -> int: - print("\n\n\n\n\n\n\n\nDELETE") total_chunks_deleted = 0 doc_id = replace_invalid_doc_id_characters(doc_id) diff --git a/backend/tests/integration/common_utils/managers/document.py b/backend/tests/integration/common_utils/managers/document.py index d1514126cc3..9ce3430fe62 100644 --- a/backend/tests/integration/common_utils/managers/document.py +++ b/backend/tests/integration/common_utils/managers/document.py @@ -40,9 +40,7 @@ def _verify_document_permissions( if group_names is not None: expected_group_keys = {f"group:{group_name}" for group_name in group_names} - print(f"Expected group keys: {expected_group_keys}") found_group_keys = {key for key in acl_keys if key.startswith("group:")} - print(f"Found group keys: {found_group_keys}") if found_group_keys != expected_group_keys: raise ValueError( f"Document {retrieved_doc['document_id']} has incorrect group ACL keys. "