From 97a963b4bf2d992639f8b29fbb1df1dd2b9f6566 Mon Sep 17 00:00:00 2001
From: rkuo-danswer <rkuo@danswer.ai>
Date: Wed, 8 Jan 2025 16:56:55 -0800
Subject: [PATCH 01/29] add index to speed up get last attempt (#3636)

* add index to speed up get last attempt

* use descending order

* put back unique param

* how did this not get formatted?

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
---
 ..._add_composite_index_for_index_attempt_.py | 35 +++++++++++++++++++
 backend/onyx/db/models.py                     |  8 +++++
 2 files changed, 43 insertions(+)
 create mode 100644 backend/alembic/versions/369644546676_add_composite_index_for_index_attempt_.py

diff --git a/backend/alembic/versions/369644546676_add_composite_index_for_index_attempt_.py b/backend/alembic/versions/369644546676_add_composite_index_for_index_attempt_.py
new file mode 100644
index 00000000000..4e0384fe486
--- /dev/null
+++ b/backend/alembic/versions/369644546676_add_composite_index_for_index_attempt_.py
@@ -0,0 +1,35 @@
+"""add composite index for index attempt time updated
+
+Revision ID: 369644546676
+Revises: 2955778aa44c
+Create Date: 2025-01-08 15:38:17.224380
+
+"""
+from alembic import op
+from sqlalchemy import text
+
+# revision identifiers, used by Alembic.
+revision = "369644546676"
+down_revision = "2955778aa44c"
+branch_labels: None = None
+depends_on: None = None
+
+
+def upgrade() -> None:
+    """Index index_attempt by cc pair, search settings, newest update first."""
+    index_name = "ix_index_attempt_ccpair_search_settings_time_updated"
+    # time_updated is passed as raw SQL text so that the DESC ordering is
+    # carried into the index definition.
+    index_columns = [
+        "connector_credential_pair_id",
+        "search_settings_id",
+        text("time_updated DESC"),
+    ]
+    op.create_index(index_name, "index_attempt", index_columns, unique=False)
+
+
+def downgrade() -> None:
+    """Remove the composite index added by this revision's upgrade()."""
+    # Name must match the one passed to op.create_index in upgrade().
+    index_name = "ix_index_attempt_ccpair_search_settings_time_updated"
+    op.drop_index(index_name, table_name="index_attempt")
diff --git a/backend/onyx/db/models.py b/backend/onyx/db/models.py
index 87370e91f67..ff1c98d13d8 100644
--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -18,6 +18,7 @@
 from fastapi_users_db_sqlalchemy.generics import TIMESTAMPAware
 from sqlalchemy import Boolean
 from sqlalchemy import DateTime
+from sqlalchemy import desc
 from sqlalchemy import Enum
 from sqlalchemy import Float
 from sqlalchemy import ForeignKey
@@ -813,6 +814,13 @@ class IndexAttempt(Base):
             "connector_credential_pair_id",
             "time_created",
         ),
+        Index(
+            "ix_index_attempt_ccpair_search_settings_time_updated",
+            "connector_credential_pair_id",
+            "search_settings_id",
+            desc("time_updated"),
+            unique=False,
+        ),
     )
 
     def __repr__(self) -> str:

From d40fd82803572ed8305d33df8448121c4a6783f3 Mon Sep 17 00:00:00 2001
From: hagen-danswer <hagen@danswer.ai>
Date: Thu, 9 Jan 2025 12:56:56 -0800
Subject: [PATCH 02/29] Conf doc sync improvements (#3643)

* Reduce number of requests to Confluence

* undo

* added a way to dynamically adjust the pagination limit

* undo
---
 .../confluence/doc_sync.py                    |  7 ++++
 .../connectors/confluence/onyx_confluence.py  | 40 ++++++++++++++-----
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/backend/ee/onyx/external_permissions/confluence/doc_sync.py b/backend/ee/onyx/external_permissions/confluence/doc_sync.py
index 708be895dd0..bd78a8eade4 100644
--- a/backend/ee/onyx/external_permissions/confluence/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/confluence/doc_sync.py
@@ -67,6 +67,13 @@ def _get_server_space_permissions(
         else:
             logger.warning(f"Email for user {user_name} not found in Confluence")
 
+    if not user_emails and not group_names:
+        logger.warning(
+            "No user emails or group names found in Confluence space permissions"
+            f"\nSpace key: {space_key}"
+            f"\nSpace permissions: {space_permissions}"
+        )
+
     return ExternalAccess(
         external_user_emails=user_emails,
         external_user_group_ids=group_names,
diff --git a/backend/onyx/connectors/confluence/onyx_confluence.py b/backend/onyx/connectors/confluence/onyx_confluence.py
index ea8a7a67e74..d95fa19630e 100644
--- a/backend/onyx/connectors/confluence/onyx_confluence.py
+++ b/backend/onyx/connectors/confluence/onyx_confluence.py
@@ -121,6 +121,7 @@ def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:
 
 
 _DEFAULT_PAGINATION_LIMIT = 1000
+_MINIMUM_PAGINATION_LIMIT = 50
 
 
 class OnyxConfluence(Confluence):
@@ -204,24 +205,41 @@ def _paginate_url(
                 # If the problematic expansion is in the url, replace it
                 # with the replacement expansion and try again
                 # If that fails, raise the error
-                if _PROBLEMATIC_EXPANSIONS not in url_suffix:
-                    logger.exception(
+                if _PROBLEMATIC_EXPANSIONS in url_suffix:
+                    logger.warning(
+                        f"Replacing {_PROBLEMATIC_EXPANSIONS} with {_REPLACEMENT_EXPANSIONS}"
+                        " and trying again."
+                    )
+                    url_suffix = url_suffix.replace(
+                        _PROBLEMATIC_EXPANSIONS,
+                        _REPLACEMENT_EXPANSIONS,
+                    )
+                    continue
+                if (
+                    raw_response.status_code == 500
+                    and limit > _MINIMUM_PAGINATION_LIMIT
+                ):
+                    new_limit = limit // 2
+                    logger.warning(
                         f"Error in confluence call to {url_suffix} \n"
                         f"Raw Response Text: {raw_response.text} \n"
                         f"Full Response: {raw_response.__dict__} \n"
                         f"Error: {e} \n"
+                        f"Reducing limit from {limit} to {new_limit} and trying again."
                     )
-                    raise e
+                    url_suffix = url_suffix.replace(
+                        f"limit={limit}", f"limit={new_limit}"
+                    )
+                    limit = new_limit
+                    continue
 
-                logger.warning(
-                    f"Replacing {_PROBLEMATIC_EXPANSIONS} with {_REPLACEMENT_EXPANSIONS}"
-                    " and trying again."
-                )
-                url_suffix = url_suffix.replace(
-                    _PROBLEMATIC_EXPANSIONS,
-                    _REPLACEMENT_EXPANSIONS,
+                logger.exception(
+                    f"Error in confluence call to {url_suffix} \n"
+                    f"Raw Response Text: {raw_response.text} \n"
+                    f"Full Response: {raw_response.__dict__} \n"
+                    f"Error: {e} \n"
                 )
-                continue
+                raise e
 
             try:
                 next_response = raw_response.json()

From 2ae91f0f2ba25f4f5a86ed848a7c99e404480361 Mon Sep 17 00:00:00 2001
From: rkuo-danswer <rkuo@danswer.ai>
Date: Thu, 9 Jan 2025 13:34:07 -0800
Subject: [PATCH 03/29] Feature/redis prod tool (#3619)

* prototype tools for handling prod issues

* add some commands

* add batching and dry run options

* custom redis tool

* comment

* default to app config settings for redis

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
---
 backend/scripts/celery_purge_queue.py |  87 +++++++++++
 backend/scripts/onyx_redis.py         | 198 ++++++++++++++++++++++++++
 2 files changed, 285 insertions(+)
 create mode 100644 backend/scripts/celery_purge_queue.py
 create mode 100644 backend/scripts/onyx_redis.py

diff --git a/backend/scripts/celery_purge_queue.py b/backend/scripts/celery_purge_queue.py
new file mode 100644
index 00000000000..cbaed2de4fe
--- /dev/null
+++ b/backend/scripts/celery_purge_queue.py
@@ -0,0 +1,87 @@
+# Tool to run operations on Celery/Redis in production
+# this is a work in progress and isn't completely put together yet
+# but can serve as a stub for future operations
+import argparse
+import logging
+from logging import getLogger
+
+from redis import Redis
+
+from onyx.background.celery.celery_redis import celery_get_queue_length
+from onyx.configs.app_configs import REDIS_DB_NUMBER_CELERY
+from onyx.redis.redis_pool import RedisPool
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.INFO,  # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Log format
+    handlers=[logging.StreamHandler()],  # Output logs to console
+)
+
+logger = getLogger(__name__)
+
+REDIS_PASSWORD = ""
+
+
+def celery_purge_queue(queue: str, tenant_id: str) -> None:
+    """Log the length of a celery queue; the purge itself is not implemented yet.
+
+    Purging is hard: the queue is a list and an item can only be removed by
+    VALUE (a linear scan), so purging many values is roughly n^2. Popping and
+    re-pushing instead raises questions about behavior on a live queue.
+
+    tenant_id is accepted but not used yet; only the queue length is logged.
+    """
+
+    pool = RedisPool.create_pool(
+        host="127.0.0.1",
+        port=6380,
+        db=REDIS_DB_NUMBER_CELERY,
+        password=REDIS_PASSWORD,
+        ssl=True,
+        ssl_cert_reqs="optional",
+        ssl_ca_certs=None,
+    )
+
+    r = Redis(connection_pool=pool)
+
+    length = celery_get_queue_length(queue, r)
+
+    logger.info(f"queue={queue} length={length}")
+
+    # processed = 0
+    # deleted = 0
+    # for i in range(len(OnyxCeleryPriority)):
+    #     queue_name = queue
+    #     if i > 0:
+    #         queue_name += CELERY_SEPARATOR
+    #         queue_name += str(i)
+
+    #     length = r.llen(queue_name)
+    #     for i in range(length):
+    #         task_raw: bytes | None = r.lindex(queue_name, i)
+    #         if not task_raw:
+    #             break
+
+    #         processed += 1
+    #         task_str = task_raw.decode("utf-8")
+    #         task = json.loads(task_str)
+    #         task_kwargs_str = task["headers"]["kwargsrepr"]
+    #         task_kwargs = json.loads(task_kwargs_str)
+    #         task_tenant_id = task_kwargs["tenant_id"]
+    #         if task_tenant_id and task_tenant_id == "tenant_id":
+    #             print("Delete tenant_id={tenant_id}")
+    #             if
+    #             deleted += 1
+
+    #         logger.info(f"processed={processed} deleted={deleted}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Purge celery queue by tenant id")
+    parser.add_argument("--queue", type=str, help="Queue to purge", required=True)
+
+    parser.add_argument("--tenant", type=str, help="Tenant ID to purge", required=True)
+
+    args = parser.parse_args()
+    celery_purge_queue(queue=args.queue, tenant_id=args.tenant)
diff --git a/backend/scripts/onyx_redis.py b/backend/scripts/onyx_redis.py
new file mode 100644
index 00000000000..c7eb7fbef5c
--- /dev/null
+++ b/backend/scripts/onyx_redis.py
@@ -0,0 +1,198 @@
+# Tool to run helpful operations on Redis in production
+# This is targeted for internal usage and may not have all the necessary parameters
+# for general usage across custom deployments
+import argparse
+import logging
+import sys
+import time
+from logging import getLogger
+from typing import cast
+
+from redis import Redis
+
+from onyx.configs.app_configs import REDIS_DB_NUMBER
+from onyx.configs.app_configs import REDIS_HOST
+from onyx.configs.app_configs import REDIS_PASSWORD
+from onyx.configs.app_configs import REDIS_PORT
+from onyx.redis.redis_pool import RedisPool
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.INFO,  # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Log format
+    handlers=[logging.StreamHandler()],  # Output logs to console
+)
+
+logger = getLogger(__name__)
+
+SCAN_ITER_COUNT = 10000
+BATCH_DEFAULT = 1000
+
+
+def onyx_redis(
+    command: str,
+    batch: int,
+    dry_run: bool,
+    host: str,
+    port: int,
+    db: int,
+    password: str | None,
+) -> int:
+    """Connect to Redis over SSL, verify the connection and run one command.
+
+    Returns the purge helper's exit code, or 255 for an unrecognized command.
+    An Exception raised by the initial ping is logged and re-raised.
+    """
+    # command name -> (glob pattern, redis type) passed to the purge helper
+    purge_targets: dict[str, tuple[str, str]] = {
+        # Purge connector tasksets. Used when the tasks represented in the
+        # tasksets have been purged.
+        "purge_connectorsync_taskset": ("*connectorsync_taskset*", "set"),
+        "purge_documentset_taskset": ("*documentset_taskset*", "set"),
+        "purge_usergroup_taskset": ("*usergroup_taskset*", "set"),
+        "purge_vespa_syncing": ("*connectorsync:vespa_syncing*", "string"),
+    }
+
+    pool = RedisPool.create_pool(
+        host=host,
+        port=port,
+        db=db,
+        password=password if password else "",
+        ssl=True,
+        ssl_cert_reqs="optional",
+        ssl_ca_certs=None,
+    )
+
+    r = Redis(connection_pool=pool)
+
+    try:
+        r.ping()
+    except Exception:
+        logger.exception("Redis ping exceptioned")
+        raise
+
+    if command not in purge_targets:
+        logger.error(f"Unknown command: {command}")
+        return 255
+    match_pattern, match_type = purge_targets[command]
+    return purge_by_match_and_type(match_pattern, match_type, batch, dry_run, r)
+
+
+def flush_batch_delete(batch_keys: list[bytes], r: Redis) -> None:
+    logger.info(f"Flushing {len(batch_keys)} operations to Redis.")
+    with r.pipeline() as pipe:
+        for batch_key in batch_keys:
+            pipe.delete(batch_key)
+        pipe.execute()
+
+
+def purge_by_match_and_type(
+    match_pattern: str, match_type: str, batch_size: int, dry_run: bool, r: Redis
+) -> int:
+    """Delete every key matching a glob pattern and Redis type, in batches.
+
+    match_pattern: glob style expression
+    match_type: https://redis.io/docs/latest/commands/type/
+    batch_size: number of keys to accumulate before each pipelined delete
+    dry_run: when True, only log the keys that would be deleted
+    r: Redis client to scan and delete against
+    Returns 0 once the scan has completed.
+    """
+    start = time.monotonic()
+
+    count = 0
+    batch_keys: list[bytes] = []
+    for key in r.scan_iter(match_pattern, count=SCAN_ITER_COUNT, _type=match_type):
+        key = cast(bytes, key)
+        key_str = key.decode("utf-8")
+
+        count += 1
+        if dry_run:
+            logger.info(f"(DRY-RUN) Deleting item {count}: {key_str}")
+            continue
+
+        logger.info(f"Deleting item {count}: {key_str}")
+
+        batch_keys.append(key)
+        if len(batch_keys) >= batch_size:
+            flush_batch_delete(batch_keys, r)
+            batch_keys.clear()
+
+    # Flush the final partial batch; the in-loop flush only fires on full
+    # batches, so anything left over here is always smaller than batch_size.
+    if batch_keys:
+        flush_batch_delete(batch_keys, r)
+        batch_keys.clear()
+
+    if dry_run:
+        logger.info(f"(DRY-RUN) Matched {count} keys; nothing was deleted.")
+    else:
+        logger.info(f"Deleted {count} matches.")
+
+    elapsed = time.monotonic() - start
+    logger.info(f"Time elapsed: {elapsed:.2f}s")
+    return 0
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Onyx Redis Manager")
+    parser.add_argument("--command", type=str, help="Operation to run", required=True)
+
+    parser.add_argument(
+        "--host",
+        type=str,
+        default=REDIS_HOST,
+        help="The redis host",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=REDIS_PORT,
+        help="The redis port",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--db",
+        type=int,
+        default=REDIS_DB_NUMBER,
+        help="The redis db",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--password",
+        type=str,
+        default=REDIS_PASSWORD,
+        help="The redis password",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--batch",
+        type=int,
+        default=BATCH_DEFAULT,
+        help="Size of operation batches to send to Redis",
+        required=False,
+    )
+
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Perform a dry run without actually executing modifications",
+        required=False,
+    )
+
+    args = parser.parse_args()
+    exitcode = onyx_redis(
+        command=args.command,
+        batch=args.batch,
+        dry_run=args.dry_run,
+        host=args.host,
+        port=args.port,
+        db=args.db,
+        password=args.password,
+    )
+    sys.exit(exitcode)

From 91e32e801de127998469cdbf4cce43bc5e724a59 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Thu, 9 Jan 2025 13:51:58 -0800
Subject: [PATCH 04/29] hope this env var works.

---
 .../docker-build-push-model-server-container-on-tag.yml         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-build-push-model-server-container-on-tag.yml b/.github/workflows/docker-build-push-model-server-container-on-tag.yml
index 7df47c416ce..7e47fcbe7b0 100644
--- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml
@@ -118,6 +118,6 @@ jobs:
           TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
           TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
         with:
-          image-ref: docker.io/onyxdotapp/onyx-model-server:${{ github.ref_name }}
+          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
           severity: "CRITICAL,HIGH"
           timeout: "10m"

From c55de284238b5a374d98b26030896b01ce9f0d73 Mon Sep 17 00:00:00 2001
From: hagen-danswer <hagen@danswer.ai>
Date: Thu, 9 Jan 2025 14:15:38 -0800
Subject: [PATCH 05/29] added distinct when outer joining for user filters
 (#3641)

* added distinct when outer joining for user filters

* Added distinct when outer joining for user filters for all
---
 backend/ee/onyx/db/token_limit.py            | 1 +
 backend/onyx/db/connector_credential_pair.py | 1 +
 backend/onyx/db/credentials.py               | 1 +
 backend/onyx/db/document_set.py              | 1 +
 backend/onyx/db/feedback.py                  | 1 +
 backend/onyx/db/persona.py                   | 1 +
 6 files changed, 6 insertions(+)

diff --git a/backend/ee/onyx/db/token_limit.py b/backend/ee/onyx/db/token_limit.py
index 47d78e1fd26..863f4450315 100644
--- a/backend/ee/onyx/db/token_limit.py
+++ b/backend/ee/onyx/db/token_limit.py
@@ -24,6 +24,7 @@ def _add_user_filters(
     if user is None or user.role == UserRole.ADMIN:
         return stmt
 
+    stmt = stmt.distinct()
     TRLimit_UG = aliased(TokenRateLimit__UserGroup)
     User__UG = aliased(User__UserGroup)
 
diff --git a/backend/onyx/db/connector_credential_pair.py b/backend/onyx/db/connector_credential_pair.py
index 3c796492b11..ea72f1a9507 100644
--- a/backend/onyx/db/connector_credential_pair.py
+++ b/backend/onyx/db/connector_credential_pair.py
@@ -39,6 +39,7 @@ def _add_user_filters(
     if user is None or user.role == UserRole.ADMIN:
         return stmt
 
+    stmt = stmt.distinct()
     UG__CCpair = aliased(UserGroup__ConnectorCredentialPair)
     User__UG = aliased(User__UserGroup)
 
diff --git a/backend/onyx/db/credentials.py b/backend/onyx/db/credentials.py
index 5c135137fbd..86cb31aa811 100644
--- a/backend/onyx/db/credentials.py
+++ b/backend/onyx/db/credentials.py
@@ -74,6 +74,7 @@ def _add_user_filters(
         # Basic users can only access credentials that are owned by them
         return stmt.where(Credential.user_id == user.id)
 
+    stmt = stmt.distinct()
     """
     THIS PART IS FOR CURATORS AND GLOBAL CURATORS
     Here we select cc_pairs by relation:
diff --git a/backend/onyx/db/document_set.py b/backend/onyx/db/document_set.py
index b5f0dd365aa..750021d29a1 100644
--- a/backend/onyx/db/document_set.py
+++ b/backend/onyx/db/document_set.py
@@ -40,6 +40,7 @@ def _add_user_filters(
     if user is None or user.role == UserRole.ADMIN:
         return stmt
 
+    stmt = stmt.distinct()
     DocumentSet__UG = aliased(DocumentSet__UserGroup)
     User__UG = aliased(User__UserGroup)
     """
diff --git a/backend/onyx/db/feedback.py b/backend/onyx/db/feedback.py
index f01d8151228..7acf44fd7e4 100644
--- a/backend/onyx/db/feedback.py
+++ b/backend/onyx/db/feedback.py
@@ -50,6 +50,7 @@ def _add_user_filters(
     if user is None or user.role == UserRole.ADMIN:
         return stmt
 
+    stmt = stmt.distinct()
     DocByCC = aliased(DocumentByConnectorCredentialPair)
     CCPair = aliased(ConnectorCredentialPair)
     UG__CCpair = aliased(UserGroup__ConnectorCredentialPair)
diff --git a/backend/onyx/db/persona.py b/backend/onyx/db/persona.py
index d092a4c3275..ec896c5d304 100644
--- a/backend/onyx/db/persona.py
+++ b/backend/onyx/db/persona.py
@@ -49,6 +49,7 @@ def _add_user_filters(
     if user is None or user.role == UserRole.ADMIN:
         return stmt
 
+    stmt = stmt.distinct()
     Persona__UG = aliased(Persona__UserGroup)
     User__UG = aliased(User__UserGroup)
     """

From 50131ba22cc369c4ff373ce83a106a1a24b64b30 Mon Sep 17 00:00:00 2001
From: hagen-danswer <hagen@danswer.ai>
Date: Thu, 9 Jan 2025 15:13:02 -0800
Subject: [PATCH 06/29] Better logging for confluence space permissions

---
 .../confluence/doc_sync.py                    |  4 ++-
 .../connectors/confluence/onyx_confluence.py  | 27 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/backend/ee/onyx/external_permissions/confluence/doc_sync.py b/backend/ee/onyx/external_permissions/confluence/doc_sync.py
index bd78a8eade4..9805cdad6ee 100644
--- a/backend/ee/onyx/external_permissions/confluence/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/confluence/doc_sync.py
@@ -24,7 +24,9 @@
 def _get_server_space_permissions(
     confluence_client: OnyxConfluence, space_key: str
 ) -> ExternalAccess:
-    space_permissions = confluence_client.get_space_permissions(space_key=space_key)
+    space_permissions = confluence_client.get_all_space_permissions_server(
+        space_key=space_key
+    )
 
     viewspace_permissions = []
     for permission_category in space_permissions:
diff --git a/backend/onyx/connectors/confluence/onyx_confluence.py b/backend/onyx/connectors/confluence/onyx_confluence.py
index d95fa19630e..e6a2b957ee7 100644
--- a/backend/onyx/connectors/confluence/onyx_confluence.py
+++ b/backend/onyx/connectors/confluence/onyx_confluence.py
@@ -354,6 +354,33 @@ def paginated_group_members_retrieval(
         group_name = quote(group_name)
         yield from self._paginate_url(f"rest/api/group/{group_name}/member", limit)
 
+    def get_all_space_permissions_server(
+        self,
+        space_key: str,
+    ) -> list[dict[str, Any]]:
+        """
+        This is a confluence server specific method that can be used to
+        fetch the permissions of a space.
+        It goes through the json-rpc endpoint rather than get_space_permissions
+        so that the raw jsonrpc response can be logged. Always returns a list.
+        """
+        url = "rpc/json-rpc/confluenceservice-v2"
+        data = {
+            "jsonrpc": "2.0",
+            "method": "getSpacePermissionSets",
+            "id": 7,
+            "params": [space_key],
+        }
+        response = self.post(url, data=data)
+        logger.debug(f"jsonrpc response: {response}")
+        result = response.get("result") if response else None
+        if not result:
+            logger.warning(
+                f"No jsonrpc response for space permissions for space {space_key}"
+                f"\nResponse: {response}"
+            )
+        return result or []
+
 
 def _validate_connector_configuration(
     credentials: dict[str, Any],

From 962240031fc2c0616a69f5efcb43c9fe8cd6ad00 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Thu, 9 Jan 2025 16:29:37 -0800
Subject: [PATCH 07/29] figuring out why multiprocessing set_start_method isn't
 working.

---
 backend/onyx/background/celery/apps/app_base.py        | 6 ++++--
 backend/onyx/background/celery/apps/heavy.py           | 1 +
 backend/onyx/background/celery/apps/indexing.py        | 1 +
 backend/onyx/background/celery/apps/light.py           | 2 ++
 backend/onyx/background/celery/apps/primary.py         | 1 +
 backend/onyx/background/celery/tasks/indexing/tasks.py | 8 ++++++--
 6 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/backend/onyx/background/celery/apps/app_base.py b/backend/onyx/background/celery/apps/app_base.py
index 22529a66c2b..5e767dfbefc 100644
--- a/backend/onyx/background/celery/apps/app_base.py
+++ b/backend/onyx/background/celery/apps/app_base.py
@@ -1,5 +1,4 @@
 import logging
-import multiprocessing
 import time
 from typing import Any
 
@@ -163,7 +162,10 @@ def on_task_postrun(
 
 def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
     """The first signal sent on celery worker startup"""
-    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
+    # rkuo: commenting out as set_start_method seems to work here on macOS
+    # but not in the cloud and it is unclear why.
+    # logger.info(f"Multiprocessing start method - setting to spawn.")
+    # multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
 
 
 def wait_for_redis(sender: Any, **kwargs: Any) -> None:
diff --git a/backend/onyx/background/celery/apps/heavy.py b/backend/onyx/background/celery/apps/heavy.py
index f45e6df9aa4..ee8958e7dd0 100644
--- a/backend/onyx/background/celery/apps/heavy.py
+++ b/backend/onyx/background/celery/apps/heavy.py
@@ -56,6 +56,7 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
     logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index 9262b632dc2..46282772ff4 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -57,6 +57,7 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
     logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
diff --git a/backend/onyx/background/celery/apps/light.py b/backend/onyx/background/celery/apps/light.py
index e6567b14770..11f1341a1e0 100644
--- a/backend/onyx/background/celery/apps/light.py
+++ b/backend/onyx/background/celery/apps/light.py
@@ -56,7 +56,9 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
     logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
+    logger.info(f"Concurrency: {sender.concurrency}")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
     SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
diff --git a/backend/onyx/background/celery/apps/primary.py b/backend/onyx/background/celery/apps/primary.py
index caa697f8837..af2105b8c6d 100644
--- a/backend/onyx/background/celery/apps/primary.py
+++ b/backend/onyx/background/celery/apps/primary.py
@@ -80,6 +80,7 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
     logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
diff --git a/backend/onyx/background/celery/tasks/indexing/tasks.py b/backend/onyx/background/celery/tasks/indexing/tasks.py
index 9fd73972d0e..b29dd1e8a08 100644
--- a/backend/onyx/background/celery/tasks/indexing/tasks.py
+++ b/backend/onyx/background/celery/tasks/indexing/tasks.py
@@ -1,3 +1,4 @@
+import multiprocessing
 import os
 import sys
 import time
@@ -853,11 +854,14 @@ def connector_indexing_proxy_task(
     search_settings_id: int,
     tenant_id: str | None,
 ) -> None:
-    """celery tasks are forked, but forking is unstable.  This proxies work to a spawned task."""
+    """celery tasks are forked, but forking is unstable.
+    This is a thread that proxies work to a spawned task."""
+
     task_logger.info(
         f"Indexing watchdog - starting: attempt={index_attempt_id} "
         f"cc_pair={cc_pair_id} "
-        f"search_settings={search_settings_id}"
+        f"search_settings={search_settings_id} "
+        f"multiprocessing={multiprocessing.get_start_method()}"
     )
 
     if not self.request.id:

From d972a78f45a696999abf727f9b38c49fada5ea47 Mon Sep 17 00:00:00 2001
From: rkuo-danswer <rkuo@danswer.ai>
Date: Thu, 9 Jan 2025 17:39:45 -0800
Subject: [PATCH 08/29] Make connector pause and delete fast (#3646)

* first cut

* refresh on delete

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
---
 .../onyx/background/celery/celery_utils.py    |  24 ++--
 backend/onyx/server/documents/cc_pair.py      | 107 +++++-------------
 .../connector/[ccPairId]/DeletionButton.tsx   |  32 ++++--
 .../app/admin/connector/[ccPairId]/page.tsx   |   2 +-
 4 files changed, 69 insertions(+), 96 deletions(-)

diff --git a/backend/onyx/background/celery/celery_utils.py b/backend/onyx/background/celery/celery_utils.py
index fc6fef1fab5..394dff35258 100644
--- a/backend/onyx/background/celery/celery_utils.py
+++ b/backend/onyx/background/celery/celery_utils.py
@@ -14,6 +14,7 @@
 from onyx.connectors.interfaces import SlimConnector
 from onyx.connectors.models import Document
 from onyx.db.connector_credential_pair import get_connector_credential_pair
+from onyx.db.enums import ConnectorCredentialPairStatus
 from onyx.db.enums import TaskStatus
 from onyx.db.models import TaskQueueState
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
@@ -41,14 +42,21 @@ def _get_deletion_status(
         return None
 
     redis_connector = RedisConnector(tenant_id, cc_pair.id)
-    if not redis_connector.delete.fenced:
-        return None
-
-    return TaskQueueState(
-        task_id="",
-        task_name=redis_connector.delete.fence_key,
-        status=TaskStatus.STARTED,
-    )
+    if redis_connector.delete.fenced:
+        return TaskQueueState(
+            task_id="",
+            task_name=redis_connector.delete.fence_key,
+            status=TaskStatus.STARTED,
+        )
+
+    if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
+        return TaskQueueState(
+            task_id="",
+            task_name=redis_connector.delete.fence_key,
+            status=TaskStatus.PENDING,
+        )
+
+    return None
 
 
 def get_deletion_attempt_snapshot(
diff --git a/backend/onyx/server/documents/cc_pair.py b/backend/onyx/server/documents/cc_pair.py
index cf87469535e..64086de5df0 100644
--- a/backend/onyx/server/documents/cc_pair.py
+++ b/backend/onyx/server/documents/cc_pair.py
@@ -164,17 +164,12 @@ def update_cc_pair_status(
     db_session: Session = Depends(get_session),
     tenant_id: str | None = Depends(get_current_tenant_id),
 ) -> JSONResponse:
-    """This method may wait up to 30 seconds if pausing the connector due to the need to
-    terminate tasks in progress. Tasks are not guaranteed to terminate within the
-    timeout.
+    """This method returns nearly immediately. It simply sets some signals and
+    optimistically assumes any running background processes will clean themselves up.
+    This is done to improve the perceived end user experience.
 
     Returns HTTPStatus.OK if everything finished.
-    Returns HTTPStatus.ACCEPTED if the connector is being paused, but background tasks
-    did not finish within the timeout.
     """
-    WAIT_TIMEOUT = 15.0
-    still_terminating = False
-
     cc_pair = get_connector_credential_pair_from_id(
         cc_pair_id=cc_pair_id,
         db_session=db_session,
@@ -188,73 +183,37 @@ def update_cc_pair_status(
             detail="Connection not found for current user's permissions",
         )
 
+    redis_connector = RedisConnector(tenant_id, cc_pair_id)
     if status_update_request.status == ConnectorCredentialPairStatus.PAUSED:
+        redis_connector.stop.set_fence(True)
+
         search_settings_list: list[SearchSettings] = get_active_search_settings(
             db_session
         )
 
-        redis_connector = RedisConnector(tenant_id, cc_pair_id)
-
-        try:
-            redis_connector.stop.set_fence(True)
-            while True:
-                logger.debug(
-                    f"Wait for indexing soft termination starting: cc_pair={cc_pair_id}"
-                )
-                wait_succeeded = redis_connector.wait_for_indexing_termination(
-                    search_settings_list, WAIT_TIMEOUT
-                )
-                if wait_succeeded:
-                    logger.debug(
-                        f"Wait for indexing soft termination succeeded: cc_pair={cc_pair_id}"
-                    )
-                    break
-
-                logger.debug(
-                    "Wait for indexing soft termination timed out. "
-                    f"Moving to hard termination: cc_pair={cc_pair_id} timeout={WAIT_TIMEOUT:.2f}"
-                )
-
-                for search_settings in search_settings_list:
-                    redis_connector_index = redis_connector.new_index(
-                        search_settings.id
-                    )
-                    if not redis_connector_index.fenced:
-                        continue
-
-                    index_payload = redis_connector_index.payload
-                    if not index_payload:
-                        continue
-
-                    if not index_payload.celery_task_id:
-                        continue
-
-                    # Revoke the task to prevent it from running
-                    primary_app.control.revoke(index_payload.celery_task_id)
-
-                    # If it is running, then signaling for termination will get the
-                    # watchdog thread to kill the spawned task
-                    redis_connector_index.set_terminate(index_payload.celery_task_id)
-
-                logger.debug(
-                    f"Wait for indexing hard termination starting: cc_pair={cc_pair_id}"
-                )
-                wait_succeeded = redis_connector.wait_for_indexing_termination(
-                    search_settings_list, WAIT_TIMEOUT
-                )
-                if wait_succeeded:
-                    logger.debug(
-                        f"Wait for indexing hard termination succeeded: cc_pair={cc_pair_id}"
-                    )
-                    break
-
-                logger.debug(
-                    f"Wait for indexing hard termination timed out: cc_pair={cc_pair_id}"
-                )
-                still_terminating = True
-                break
-        finally:
-            redis_connector.stop.set_fence(False)
+        while True:
+            for search_settings in search_settings_list:
+                redis_connector_index = redis_connector.new_index(search_settings.id)
+                if not redis_connector_index.fenced:
+                    continue
+
+                index_payload = redis_connector_index.payload
+                if not index_payload:
+                    continue
+
+                if not index_payload.celery_task_id:
+                    continue
+
+                # Revoke the task to prevent it from running
+                primary_app.control.revoke(index_payload.celery_task_id)
+
+                # If it is running, then signaling for termination will get the
+                # watchdog thread to kill the spawned task
+                redis_connector_index.set_terminate(index_payload.celery_task_id)
+
+            break
+    else:
+        redis_connector.stop.set_fence(False)
 
     update_connector_credential_pair_from_id(
         db_session=db_session,
@@ -264,14 +223,6 @@ def update_cc_pair_status(
 
     db_session.commit()
 
-    if still_terminating:
-        return JSONResponse(
-            status_code=HTTPStatus.ACCEPTED,
-            content={
-                "message": "Request accepted, background task termination still in progress"
-            },
-        )
-
     return JSONResponse(
         status_code=HTTPStatus.OK, content={"message": str(HTTPStatus.OK)}
     )
diff --git a/web/src/app/admin/connector/[ccPairId]/DeletionButton.tsx b/web/src/app/admin/connector/[ccPairId]/DeletionButton.tsx
index ccef14b5a35..fe430af33fc 100644
--- a/web/src/app/admin/connector/[ccPairId]/DeletionButton.tsx
+++ b/web/src/app/admin/connector/[ccPairId]/DeletionButton.tsx
@@ -8,7 +8,13 @@ import { deleteCCPair } from "@/lib/documentDeletion";
 import { mutate } from "swr";
 import { buildCCPairInfoUrl } from "./lib";
 
-export function DeletionButton({ ccPair }: { ccPair: CCPairFullInfo }) {
+export function DeletionButton({
+  ccPair,
+  refresh,
+}: {
+  ccPair: CCPairFullInfo;
+  refresh: () => void;
+}) {
   const { popup, setPopup } = usePopup();
 
   const isDeleting =
@@ -31,14 +37,22 @@ export function DeletionButton({ ccPair }: { ccPair: CCPairFullInfo }) {
       {popup}
       <Button
         variant="destructive"
-        onClick={() =>
-          deleteCCPair(
-            ccPair.connector.id,
-            ccPair.credential.id,
-            setPopup,
-            () => mutate(buildCCPairInfoUrl(ccPair.id))
-          )
-        }
+        onClick={async () => {
+          try {
+            // Await the delete operation to ensure it completes
+            await deleteCCPair(
+              ccPair.connector.id,
+              ccPair.credential.id,
+              setPopup,
+              () => mutate(buildCCPairInfoUrl(ccPair.id))
+            );
+
+            // Call refresh to update the state after deletion
+            refresh();
+          } catch (error) {
+            console.error("Error deleting connector:", error);
+          }
+        }}
         icon={FiTrash}
         disabled={
           ccPair.status === ConnectorCredentialPairStatus.ACTIVE || isDeleting
diff --git a/web/src/app/admin/connector/[ccPairId]/page.tsx b/web/src/app/admin/connector/[ccPairId]/page.tsx
index 08bcdece6af..7881de4e861 100644
--- a/web/src/app/admin/connector/[ccPairId]/page.tsx
+++ b/web/src/app/admin/connector/[ccPairId]/page.tsx
@@ -362,7 +362,7 @@ function Main({ ccPairId }: { ccPairId: number }) {
       <div className="flex mt-4">
         <div className="mx-auto">
           {ccPair.is_editable_for_current_user && (
-            <DeletionButton ccPair={ccPair} />
+            <DeletionButton ccPair={ccPair} refresh={refresh} />
           )}
         </div>
       </div>

From bf78fb79f8ce53fa9842f572bd9a009e279752e1 Mon Sep 17 00:00:00 2001
From: rkuo-danswer <rkuo@danswer.ai>
Date: Thu, 9 Jan 2025 18:10:59 -0800
Subject: [PATCH 09/29] possible fix for gdrive oauth in the cloud (#3642)

* possible fix for gd oauth in the cloud

* missed code in rename/merge

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
---
 .../onyx/connectors/google_utils/google_kv.py | 14 ++++++
 backend/onyx/server/documents/connector.py    | 50 +++++++++++++++++--
 backend/onyx/server/documents/credential.py   |  3 --
 3 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/backend/onyx/connectors/google_utils/google_kv.py b/backend/onyx/connectors/google_utils/google_kv.py
index 96785e32544..4f714e98fb5 100644
--- a/backend/onyx/connectors/google_utils/google_kv.py
+++ b/backend/onyx/connectors/google_utils/google_kv.py
@@ -17,6 +17,9 @@
 from onyx.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY
 from onyx.connectors.google_utils.resources import get_drive_service
 from onyx.connectors.google_utils.resources import get_gmail_service
+from onyx.connectors.google_utils.shared_constants import (
+    DB_CREDENTIALS_AUTHENTICATION_METHOD,
+)
 from onyx.connectors.google_utils.shared_constants import (
     DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
 )
@@ -29,6 +32,9 @@
 from onyx.connectors.google_utils.shared_constants import (
     GOOGLE_SCOPES,
 )
+from onyx.connectors.google_utils.shared_constants import (
+    GoogleOAuthAuthenticationMethod,
+)
 from onyx.connectors.google_utils.shared_constants import (
     MISSING_SCOPES_ERROR_STR,
 )
@@ -96,6 +102,7 @@ def update_credential_access_tokens(
     user: User,
     db_session: Session,
     source: DocumentSource,
+    auth_method: GoogleOAuthAuthenticationMethod,
 ) -> OAuthCredentials | None:
     app_credentials = get_google_app_cred(source)
     flow = InstalledAppFlow.from_client_config(
@@ -119,6 +126,7 @@ def update_credential_access_tokens(
     new_creds_dict = {
         DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str,
         DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
+        DB_CREDENTIALS_AUTHENTICATION_METHOD: auth_method.value,
     }
 
     if not update_credential_json(credential_id, new_creds_dict, user, db_session):
@@ -129,6 +137,7 @@ def update_credential_access_tokens(
 def build_service_account_creds(
     source: DocumentSource,
     primary_admin_email: str | None = None,
+    name: str | None = None,
 ) -> CredentialBase:
     service_account_key = get_service_account_key(source=source)
 
@@ -138,10 +147,15 @@ def build_service_account_creds(
     if primary_admin_email:
         credential_dict[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] = primary_admin_email
 
+    credential_dict[
+        DB_CREDENTIALS_AUTHENTICATION_METHOD
+    ] = GoogleOAuthAuthenticationMethod.UPLOADED.value
+
     return CredentialBase(
         credential_json=credential_dict,
         admin_public=True,
         source=source,
+        name=name,
     )
 
 
diff --git a/backend/onyx/server/documents/connector.py b/backend/onyx/server/documents/connector.py
index 46c74942e1e..6be024cb227 100644
--- a/backend/onyx/server/documents/connector.py
+++ b/backend/onyx/server/documents/connector.py
@@ -53,8 +53,9 @@
     upsert_service_account_key,
 )
 from onyx.connectors.google_utils.google_kv import verify_csrf
+from onyx.connectors.google_utils.shared_constants import DB_CREDENTIALS_DICT_TOKEN_KEY
 from onyx.connectors.google_utils.shared_constants import (
-    DB_CREDENTIALS_DICT_TOKEN_KEY,
+    GoogleOAuthAuthenticationMethod,
 )
 from onyx.db.connector import create_connector
 from onyx.db.connector import delete_connector
@@ -314,6 +315,7 @@ def upsert_service_account_credential(
         credential_base = build_service_account_creds(
             DocumentSource.GOOGLE_DRIVE,
             primary_admin_email=service_account_credential_request.google_primary_admin,
+            name="Service Account (uploaded)",
         )
     except KvKeyNotFoundError as e:
         raise HTTPException(status_code=400, detail=str(e))
@@ -408,6 +410,38 @@ def upload_files(
     return FileUploadResponse(file_paths=deduped_file_paths)
 
 
+@router.get("/admin/connector")
+def get_connectors_by_credential(
+    _: User = Depends(current_curator_or_admin_user),
+    db_session: Session = Depends(get_session),
+    credential: int | None = None,
+) -> list[ConnectorSnapshot]:
+    """Get a list of connectors. Allow filtering by a specific credential id."""
+
+    connectors = fetch_connectors(db_session)
+
+    filtered_connectors = []
+    for connector in connectors:
+        if connector.source == DocumentSource.INGESTION_API:
+            # don't include INGESTION_API, as it's a system level
+            # connector not manageable by the user
+            continue
+
+        if credential is not None:
+            found = False
+            for cc_pair in connector.credentials:
+                if credential == cc_pair.credential_id:
+                    found = True
+                    break
+
+            if not found:
+                continue
+
+        filtered_connectors.append(ConnectorSnapshot.from_connector_db_model(connector))
+
+    return filtered_connectors
+
+
 # Retrieves most recent failure cases for connectors that are currently failing
 @router.get("/admin/connector/failed-indexing-status")
 def get_currently_failed_indexing_status(
@@ -987,7 +1021,12 @@ def gmail_callback(
     credential_id = int(credential_id_cookie)
     verify_csrf(credential_id, callback.state)
     credentials: Credentials | None = update_credential_access_tokens(
-        callback.code, credential_id, user, db_session, DocumentSource.GMAIL
+        callback.code,
+        credential_id,
+        user,
+        db_session,
+        DocumentSource.GMAIL,
+        GoogleOAuthAuthenticationMethod.UPLOADED,
     )
     if credentials is None:
         raise HTTPException(
@@ -1013,7 +1052,12 @@ def google_drive_callback(
     verify_csrf(credential_id, callback.state)
 
     credentials: Credentials | None = update_credential_access_tokens(
-        callback.code, credential_id, user, db_session, DocumentSource.GOOGLE_DRIVE
+        callback.code,
+        credential_id,
+        user,
+        db_session,
+        DocumentSource.GOOGLE_DRIVE,
+        GoogleOAuthAuthenticationMethod.UPLOADED,
     )
     if credentials is None:
         raise HTTPException(
diff --git a/backend/onyx/server/documents/credential.py b/backend/onyx/server/documents/credential.py
index 51d9643dc77..b68ee660cb7 100644
--- a/backend/onyx/server/documents/credential.py
+++ b/backend/onyx/server/documents/credential.py
@@ -9,7 +9,6 @@
 from onyx.auth.users import current_user
 from onyx.db.credentials import alter_credential
 from onyx.db.credentials import cleanup_gmail_credentials
-from onyx.db.credentials import cleanup_google_drive_credentials
 from onyx.db.credentials import create_credential
 from onyx.db.credentials import CREDENTIAL_PERMISSIONS_TO_IGNORE
 from onyx.db.credentials import delete_credential
@@ -133,8 +132,6 @@ def create_credential_from_model(
     # Temporary fix for empty Google App credentials
     if credential_info.source == DocumentSource.GMAIL:
         cleanup_gmail_credentials(db_session=db_session)
-    if credential_info.source == DocumentSource.GOOGLE_DRIVE:
-        cleanup_google_drive_credentials(db_session=db_session)
 
     credential = create_credential(credential_info, user, db_session)
     return ObjectCreationIdResponse(

From 1470b7e038e55f3d7da701c54cc30c69be44f051 Mon Sep 17 00:00:00 2001
From: Weves <chrisweaver101@gmail.com>
Date: Thu, 9 Jan 2025 20:03:02 -0800
Subject: [PATCH 10/29] Add tests for some LLM provider endpoints + small logic
 change to ensure that display_model_names is not empty

---
 backend/onyx/server/manage/llm/api.py         |  27 ++--
 backend/tests/integration/conftest.py         |  28 ++++
 .../openai_assistants_api/conftest.py         |  28 ----
 .../tests/llm_provider/test_llm_provider.py   | 120 ++++++++++++++++++
 4 files changed, 162 insertions(+), 41 deletions(-)
 create mode 100644 backend/tests/integration/tests/llm_provider/test_llm_provider.py

diff --git a/backend/onyx/server/manage/llm/api.py b/backend/onyx/server/manage/llm/api.py
index dc36ce649cb..b5b52f59014 100644
--- a/backend/onyx/server/manage/llm/api.py
+++ b/backend/onyx/server/manage/llm/api.py
@@ -142,19 +142,20 @@ def put_llm_provider(
             detail=f"LLM Provider with name {llm_provider.name} already exists",
         )
 
-    # Ensure default_model_name and fast_default_model_name are in display_model_names
-    # This is necessary for custom models and Bedrock/Azure models
-    if llm_provider.display_model_names is None:
-        llm_provider.display_model_names = []
-
-    if llm_provider.default_model_name not in llm_provider.display_model_names:
-        llm_provider.display_model_names.append(llm_provider.default_model_name)
-
-    if (
-        llm_provider.fast_default_model_name
-        and llm_provider.fast_default_model_name not in llm_provider.display_model_names
-    ):
-        llm_provider.display_model_names.append(llm_provider.fast_default_model_name)
+    if llm_provider.display_model_names is not None:
+        # Ensure default_model_name and fast_default_model_name are in display_model_names
+        # This is necessary for custom models and Bedrock/Azure models
+        if llm_provider.default_model_name not in llm_provider.display_model_names:
+            llm_provider.display_model_names.append(llm_provider.default_model_name)
+
+        if (
+            llm_provider.fast_default_model_name
+            and llm_provider.fast_default_model_name
+            not in llm_provider.display_model_names
+        ):
+            llm_provider.display_model_names.append(
+                llm_provider.fast_default_model_name
+            )
 
     try:
         return upsert_llm_provider(
diff --git a/backend/tests/integration/conftest.py b/backend/tests/integration/conftest.py
index 5eba1e66f87..ec50669d0bb 100644
--- a/backend/tests/integration/conftest.py
+++ b/backend/tests/integration/conftest.py
@@ -4,8 +4,12 @@
 import pytest
 from sqlalchemy.orm import Session
 
+from onyx.auth.schemas import UserRole
 from onyx.db.engine import get_session_context_manager
 from onyx.db.search_settings import get_current_search_settings
+from tests.integration.common_utils.constants import GENERAL_HEADERS
+from tests.integration.common_utils.managers.user import build_email
+from tests.integration.common_utils.managers.user import DEFAULT_PASSWORD
 from tests.integration.common_utils.managers.user import UserManager
 from tests.integration.common_utils.reset import reset_all
 from tests.integration.common_utils.reset import reset_all_multitenant
@@ -57,6 +61,30 @@ def new_admin_user(reset: None) -> DATestUser | None:
         return None
 
 
+@pytest.fixture
+def admin_user() -> DATestUser | None:
+    try:
+        return UserManager.create(name="admin_user")
+    except Exception:
+        pass
+
+    try:
+        return UserManager.login_as_user(
+            DATestUser(
+                id="",
+                email=build_email("admin_user"),
+                password=DEFAULT_PASSWORD,
+                headers=GENERAL_HEADERS,
+                role=UserRole.ADMIN,
+                is_active=True,
+            )
+        )
+    except Exception:
+        pass
+
+    return None
+
+
 @pytest.fixture
 def reset_multitenant() -> None:
     reset_all_multitenant()
diff --git a/backend/tests/integration/openai_assistants_api/conftest.py b/backend/tests/integration/openai_assistants_api/conftest.py
index 37ada5cd87b..5fc6660ee62 100644
--- a/backend/tests/integration/openai_assistants_api/conftest.py
+++ b/backend/tests/integration/openai_assistants_api/conftest.py
@@ -7,40 +7,12 @@
 from tests.integration.common_utils.constants import API_SERVER_URL
 from tests.integration.common_utils.constants import GENERAL_HEADERS
 from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
-from tests.integration.common_utils.managers.user import build_email
-from tests.integration.common_utils.managers.user import DEFAULT_PASSWORD
-from tests.integration.common_utils.managers.user import UserManager
-from tests.integration.common_utils.managers.user import UserRole
 from tests.integration.common_utils.test_models import DATestLLMProvider
 from tests.integration.common_utils.test_models import DATestUser
 
 BASE_URL = f"{API_SERVER_URL}/openai-assistants"
 
 
-@pytest.fixture
-def admin_user() -> DATestUser | None:
-    try:
-        return UserManager.create("admin_user")
-    except Exception:
-        pass
-
-    try:
-        return UserManager.login_as_user(
-            DATestUser(
-                id="",
-                email=build_email("admin_user"),
-                password=DEFAULT_PASSWORD,
-                headers=GENERAL_HEADERS,
-                role=UserRole.ADMIN,
-                is_active=True,
-            )
-        )
-    except Exception:
-        pass
-
-    return None
-
-
 @pytest.fixture
 def llm_provider(admin_user: DATestUser | None) -> DATestLLMProvider:
     return LLMProviderManager.create(user_performing_action=admin_user)
diff --git a/backend/tests/integration/tests/llm_provider/test_llm_provider.py b/backend/tests/integration/tests/llm_provider/test_llm_provider.py
new file mode 100644
index 00000000000..4540f24b239
--- /dev/null
+++ b/backend/tests/integration/tests/llm_provider/test_llm_provider.py
@@ -0,0 +1,120 @@
+import uuid
+
+import requests
+
+from tests.integration.common_utils.constants import API_SERVER_URL
+from tests.integration.common_utils.test_models import DATestUser
+
+
+_DEFAULT_MODELS = ["gpt-4", "gpt-4o"]
+
+
+def _get_provider_by_id(admin_user: DATestUser, provider_id: str) -> dict | None:
+    """Utility function to fetch an LLM provider by ID"""
+    response = requests.get(
+        f"{API_SERVER_URL}/admin/llm/provider",
+        headers=admin_user.headers,
+    )
+    assert response.status_code == 200
+    providers = response.json()
+    return next((p for p in providers if p["id"] == provider_id), None)
+
+
+def test_create_llm_provider_without_display_model_names(
+    admin_user: DATestUser,
+) -> None:
+    """Test creating an LLM provider without specifying
+    display_model_names and verify it's null in response"""
+    # Create LLM provider without display_model_names
+    response = requests.put(
+        f"{API_SERVER_URL}/admin/llm/provider",
+        headers=admin_user.headers,
+        json={
+            "name": str(uuid.uuid4()),
+            "provider": "openai",
+            "default_model_name": _DEFAULT_MODELS[0],
+            "model_names": _DEFAULT_MODELS,
+            "is_public": True,
+            "groups": [],
+        },
+    )
+    assert response.status_code == 200
+    created_provider = response.json()
+    provider_data = _get_provider_by_id(admin_user, created_provider["id"])
+
+    # Verify display_model_names is None/null and the other fields round-trip
+    assert provider_data is not None
+    assert provider_data["model_names"] == _DEFAULT_MODELS
+    assert provider_data["default_model_name"] == _DEFAULT_MODELS[0]
+    assert provider_data["display_model_names"] is None
+
+
+def test_update_llm_provider_model_names(admin_user: DATestUser) -> None:
+    """Test updating an LLM provider's model_names"""
+    # First create provider with a single entry in model_names
+    name = str(uuid.uuid4())
+    response = requests.put(
+        f"{API_SERVER_URL}/admin/llm/provider",
+        headers=admin_user.headers,
+        json={
+            "name": name,
+            "provider": "openai",
+            "default_model_name": _DEFAULT_MODELS[0],
+            "model_names": [_DEFAULT_MODELS[0]],
+            "is_public": True,
+            "groups": [],
+        },
+    )
+    assert response.status_code == 200
+    created_provider = response.json()
+
+    # Update with the full model_names list
+    response = requests.put(
+        f"{API_SERVER_URL}/admin/llm/provider",
+        headers=admin_user.headers,
+        json={
+            "id": created_provider["id"],
+            "name": name,
+            "provider": created_provider["provider"],
+            "default_model_name": _DEFAULT_MODELS[0],
+            "model_names": _DEFAULT_MODELS,
+            "is_public": True,
+            "groups": [],
+        },
+    )
+    assert response.status_code == 200
+
+    # Verify update
+    provider_data = _get_provider_by_id(admin_user, created_provider["id"])
+    assert provider_data is not None
+    assert provider_data["model_names"] == _DEFAULT_MODELS
+
+
+def test_delete_llm_provider(admin_user: DATestUser) -> None:
+    """Test deleting an LLM provider"""
+    # Create a provider
+    response = requests.put(
+        f"{API_SERVER_URL}/admin/llm/provider",
+        headers=admin_user.headers,
+        json={
+            "name": "test-provider-delete",
+            "provider": "openai",
+            "default_model_name": _DEFAULT_MODELS[0],
+            "model_names": _DEFAULT_MODELS,
+            "is_public": True,
+            "groups": [],
+        },
+    )
+    assert response.status_code == 200
+    created_provider = response.json()
+
+    # Delete the provider
+    response = requests.delete(
+        f"{API_SERVER_URL}/admin/llm/provider/{created_provider['id']}",
+        headers=admin_user.headers,
+    )
+    assert response.status_code == 200
+
+    # Verify provider is deleted by checking it's not in the list
+    provider_data = _get_provider_by_id(admin_user, created_provider["id"])
+    assert provider_data is None

From cab7e60542e7a01ac1013120e3002d3ba7df3e36 Mon Sep 17 00:00:00 2001
From: pablonyx <pablo@danswer.ai>
Date: Fri, 10 Jan 2025 11:31:11 -0800
Subject: [PATCH 11/29] Proper anonymous user restricting (#3645)

---
 backend/ee/onyx/db/analytics.py              |  3 ++-
 backend/ee/onyx/db/token_limit.py            | 11 +++++++++--
 backend/onyx/db/connector_credential_pair.py | 12 +++++++++---
 backend/onyx/db/document_set.py              | 11 +++++++++--
 backend/onyx/db/feedback.py                  | 11 +++++++++--
 backend/onyx/db/persona.py                   | 11 +++++++++--
 6 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/backend/ee/onyx/db/analytics.py b/backend/ee/onyx/db/analytics.py
index 5e525fa624d..b9ae0005d3a 100644
--- a/backend/ee/onyx/db/analytics.py
+++ b/backend/ee/onyx/db/analytics.py
@@ -345,7 +345,8 @@ def fetch_assistant_unique_users_total(
 def user_can_view_assistant_stats(
     db_session: Session, user: User | None, assistant_id: int
 ) -> bool:
-    # If user is None, assume the user is an admin or auth is disabled
+    # NOTE(review): any None user is treated as admin here; DISABLE_AUTH is not checked
+
     if user is None or user.role == UserRole.ADMIN:
         return True
 
diff --git a/backend/ee/onyx/db/token_limit.py b/backend/ee/onyx/db/token_limit.py
index 863f4450315..ca5249e6923 100644
--- a/backend/ee/onyx/db/token_limit.py
+++ b/backend/ee/onyx/db/token_limit.py
@@ -7,6 +7,7 @@
 from sqlalchemy.orm import aliased
 from sqlalchemy.orm import Session
 
+from onyx.configs.app_configs import DISABLE_AUTH
 from onyx.configs.constants import TokenRateLimitScope
 from onyx.db.models import TokenRateLimit
 from onyx.db.models import TokenRateLimit__UserGroup
@@ -20,8 +21,8 @@
 def _add_user_filters(
     stmt: Select, user: User | None, get_editable: bool = True
 ) -> Select:
-    # If user is None, assume the user is an admin or auth is disabled
-    if user is None or user.role == UserRole.ADMIN:
+    # If user is None and auth is disabled, assume the user is an admin
+    if (user is None and DISABLE_AUTH) or (user and user.role == UserRole.ADMIN):
         return stmt
 
     stmt = stmt.distinct()
@@ -47,6 +48,12 @@ def _add_user_filters(
     that the user isn't a curator for
     - if we are not editing, we show all token_rate_limits in the groups the user curates
     """
+
+    # If user is None, this is an anonymous user; only show GLOBAL-scope token_rate_limits
+    if user is None:
+        where_clause = TokenRateLimit.scope == TokenRateLimitScope.GLOBAL
+        return stmt.where(where_clause)
+
     where_clause = User__UG.user_id == user.id
     if user.role == UserRole.CURATOR and get_editable:
         where_clause &= User__UG.is_curator == True  # noqa: E712
diff --git a/backend/onyx/db/connector_credential_pair.py b/backend/onyx/db/connector_credential_pair.py
index ea72f1a9507..3378a8d493b 100644
--- a/backend/onyx/db/connector_credential_pair.py
+++ b/backend/onyx/db/connector_credential_pair.py
@@ -10,6 +10,7 @@
 from sqlalchemy.orm import joinedload
 from sqlalchemy.orm import Session
 
+from onyx.configs.app_configs import DISABLE_AUTH
 from onyx.configs.constants import DocumentSource
 from onyx.db.connector import fetch_connector_by_id
 from onyx.db.credentials import fetch_credential_by_id
@@ -28,15 +29,14 @@
 from onyx.utils.logger import setup_logger
 from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
 
-
 logger = setup_logger()
 
 
 def _add_user_filters(
     stmt: Select, user: User | None, get_editable: bool = True
 ) -> Select:
-    # If user is None, assume the user is an admin or auth is disabled
-    if user is None or user.role == UserRole.ADMIN:
+    # If user is None and auth is disabled, assume the user is an admin
+    if (user is None and DISABLE_AUTH) or (user and user.role == UserRole.ADMIN):
         return stmt
 
     stmt = stmt.distinct()
@@ -63,6 +63,12 @@ def _add_user_filters(
     - if we are not editing, we show all cc_pairs in the groups the user is a curator
     for (as well as public cc_pairs)
     """
+
+    # If user is None, this is an anonymous user and we should only show public cc_pairs
+    if user is None:
+        where_clause = ConnectorCredentialPair.access_type == AccessType.PUBLIC
+        return stmt.where(where_clause)
+
     where_clause = User__UG.user_id == user.id
     if user.role == UserRole.CURATOR and get_editable:
         where_clause &= User__UG.is_curator == True  # noqa: E712
diff --git a/backend/onyx/db/document_set.py b/backend/onyx/db/document_set.py
index 750021d29a1..dfc4f53a189 100644
--- a/backend/onyx/db/document_set.py
+++ b/backend/onyx/db/document_set.py
@@ -12,6 +12,7 @@
 from sqlalchemy.orm import aliased
 from sqlalchemy.orm import Session
 
+from onyx.configs.app_configs import DISABLE_AUTH
 from onyx.db.connector_credential_pair import get_cc_pair_groups_for_ids
 from onyx.db.connector_credential_pair import get_connector_credential_pairs
 from onyx.db.enums import AccessType
@@ -36,8 +37,8 @@
 def _add_user_filters(
     stmt: Select, user: User | None, get_editable: bool = True
 ) -> Select:
-    # If user is None, assume the user is an admin or auth is disabled
-    if user is None or user.role == UserRole.ADMIN:
+    # If user is None and auth is disabled, assume the user is an admin
+    if (user is None and DISABLE_AUTH) or (user and user.role == UserRole.ADMIN):
         return stmt
 
     stmt = stmt.distinct()
@@ -61,6 +62,12 @@ def _add_user_filters(
     - if we are not editing, we show all DocumentSets in the groups the user is a curator
     for (as well as public DocumentSets)
     """
+
+    # If user is None, this is an anonymous user and we should only show public DocumentSets
+    if user is None:
+        where_clause = DocumentSetDBModel.is_public == True  # noqa: E712
+        return stmt.where(where_clause)
+
     where_clause = User__UserGroup.user_id == user.id
     if user.role == UserRole.CURATOR and get_editable:
         where_clause &= User__UserGroup.is_curator == True  # noqa: E712
diff --git a/backend/onyx/db/feedback.py b/backend/onyx/db/feedback.py
index 7acf44fd7e4..0a8f9e969c6 100644
--- a/backend/onyx/db/feedback.py
+++ b/backend/onyx/db/feedback.py
@@ -13,6 +13,7 @@
 from sqlalchemy.orm import aliased
 from sqlalchemy.orm import Session
 
+from onyx.configs.app_configs import DISABLE_AUTH
 from onyx.configs.constants import MessageType
 from onyx.configs.constants import SearchFeedbackType
 from onyx.db.chat import get_chat_message
@@ -46,8 +47,8 @@ def _fetch_db_doc_by_id(doc_id: str, db_session: Session) -> DbDocument:
 def _add_user_filters(
     stmt: Select, user: User | None, get_editable: bool = True
 ) -> Select:
-    # If user is None, assume the user is an admin or auth is disabled
-    if user is None or user.role == UserRole.ADMIN:
+    # If user is None and auth is disabled, assume the user is an admin
+    if (user is None and DISABLE_AUTH) or (user and user.role == UserRole.ADMIN):
         return stmt
 
     stmt = stmt.distinct()
@@ -84,6 +85,12 @@ def _add_user_filters(
     - if we are not editing, we show all objects in the groups the user is a curator
     for (as well as public objects as well)
     """
+
+    # If user is None, this is an anonymous user and we should only show public documents
+    if user is None:
+        where_clause = CCPair.access_type == AccessType.PUBLIC
+        return stmt.where(where_clause)
+
     where_clause = User__UG.user_id == user.id
     if user.role == UserRole.CURATOR and get_editable:
         where_clause &= User__UG.is_curator == True  # noqa: E712
diff --git a/backend/onyx/db/persona.py b/backend/onyx/db/persona.py
index ec896c5d304..002ee0d4edb 100644
--- a/backend/onyx/db/persona.py
+++ b/backend/onyx/db/persona.py
@@ -17,6 +17,7 @@
 from sqlalchemy.orm import Session
 
 from onyx.auth.schemas import UserRole
+from onyx.configs.app_configs import DISABLE_AUTH
 from onyx.configs.chat_configs import BING_API_KEY
 from onyx.configs.chat_configs import CONTEXT_CHUNKS_ABOVE
 from onyx.configs.chat_configs import CONTEXT_CHUNKS_BELOW
@@ -45,8 +46,8 @@
 def _add_user_filters(
     stmt: Select, user: User | None, get_editable: bool = True
 ) -> Select:
-    # If user is None, assume the user is an admin or auth is disabled
-    if user is None or user.role == UserRole.ADMIN:
+    # If user is None and auth is disabled, assume the user is an admin
+    if (user is None and DISABLE_AUTH) or (user and user.role == UserRole.ADMIN):
         return stmt
 
     stmt = stmt.distinct()
@@ -78,6 +79,12 @@ def _add_user_filters(
     for (as well as public Personas)
     - if we are not editing, we return all Personas directly connected to the user
     """
+
+    # If user is None, this is an anonymous user and we should only show public Personas
+    if user is None:
+        where_clause = Persona.is_public == True  # noqa: E712
+        return stmt.where(where_clause)
+
     where_clause = User__UserGroup.user_id == user.id
     if user.role == UserRole.CURATOR and get_editable:
         where_clause &= User__UserGroup.is_curator == True  # noqa: E712

From ac182c74b3862ba75097f6f346b7d39357b5878d Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Fri, 10 Jan 2025 12:11:33 -0800
Subject: [PATCH 12/29] log all start methods

---
 backend/onyx/background/celery/apps/indexing.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index 46282772ff4..818277ab47a 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -57,6 +57,10 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+
+    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
+    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
+
     multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
     logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
 

From b6c2ecfecb88d786129a6b3c6cf81699e2b6b997 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Fri, 10 Jan 2025 12:16:13 -0800
Subject: [PATCH 13/29] more debugging of start method

---
 backend/onyx/background/celery/apps/heavy.py    | 8 +++++++-
 backend/onyx/background/celery/apps/indexing.py | 4 +++-
 backend/onyx/background/celery/apps/light.py    | 9 ++++++++-
 backend/onyx/background/celery/apps/primary.py  | 8 +++++++-
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/backend/onyx/background/celery/apps/heavy.py b/backend/onyx/background/celery/apps/heavy.py
index ee8958e7dd0..1a8b6587447 100644
--- a/backend/onyx/background/celery/apps/heavy.py
+++ b/backend/onyx/background/celery/apps/heavy.py
@@ -56,8 +56,14 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+
+    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
+    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
+
     multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
+    logger.info(
+        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
+    )
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
     SqlEngine.init_engine(pool_size=4, max_overflow=12)
diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index 818277ab47a..1db1641ae52 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -62,7 +62,9 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info(f"Multiprocessing all start methods: {all_start_methods}")
 
     multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
+    logger.info(
+        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
+    )
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
 
diff --git a/backend/onyx/background/celery/apps/light.py b/backend/onyx/background/celery/apps/light.py
index 11f1341a1e0..73058ad3218 100644
--- a/backend/onyx/background/celery/apps/light.py
+++ b/backend/onyx/background/celery/apps/light.py
@@ -56,8 +56,15 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+
+    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
+    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
+
     multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
+    logger.info(
+        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
+    )
+
     logger.info(f"Concurrency: {sender.concurrency}")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
diff --git a/backend/onyx/background/celery/apps/primary.py b/backend/onyx/background/celery/apps/primary.py
index af2105b8c6d..23f24852796 100644
--- a/backend/onyx/background/celery/apps/primary.py
+++ b/backend/onyx/background/celery/apps/primary.py
@@ -80,8 +80,14 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 @worker_init.connect
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
+
+    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
+    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
+
     multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
+    logger.info(
+        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
+    )
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
     SqlEngine.init_engine(pool_size=8, max_overflow=0)

From 2163a138ed57d604d97d8f3ed9fa6daa48be9a59 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Fri, 10 Jan 2025 12:41:05 -0800
Subject: [PATCH 14/29] logging

---
 backend/onyx/background/celery/tasks/indexing/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/onyx/background/celery/tasks/indexing/tasks.py b/backend/onyx/background/celery/tasks/indexing/tasks.py
index b29dd1e8a08..771ee8e709a 100644
--- a/backend/onyx/background/celery/tasks/indexing/tasks.py
+++ b/backend/onyx/background/celery/tasks/indexing/tasks.py
@@ -861,7 +861,7 @@ def connector_indexing_proxy_task(
         f"Indexing watchdog - starting: attempt={index_attempt_id} "
         f"cc_pair={cc_pair_id} "
         f"search_settings={search_settings_id} "
-        f"multiprocessing={multiprocessing.get_start_method()}"
+        f"mp_start_method={multiprocessing.get_start_method()}"
     )
 
     if not self.request.id:

From 384a38418bce7b1fa57091765585a8f7c50e4846 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Fri, 10 Jan 2025 12:59:34 -0800
Subject: [PATCH 15/29] test set_spawn_method and handle exceptions

---
 backend/onyx/background/celery/apps/heavy.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/backend/onyx/background/celery/apps/heavy.py b/backend/onyx/background/celery/apps/heavy.py
index 1a8b6587447..c49ccfa751b 100644
--- a/backend/onyx/background/celery/apps/heavy.py
+++ b/backend/onyx/background/celery/apps/heavy.py
@@ -60,7 +60,17 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     all_start_methods: list[str] = multiprocessing.get_all_start_methods()
     logger.info(f"Multiprocessing all start methods: {all_start_methods}")
 
-    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
+    try:
+        multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
+    except Exception:
+        logger.info("multiprocessing.set_start_method exceptioned.")
+        try:
+            multiprocessing.set_start_method(
+                "spawn", force=True
+            )  # fork is unsafe, set to spawn
+        except Exception:
+            logger.info("multiprocessing.set_start_method force=True exceptioned.")
+
     logger.info(
         f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
     )

From 4400a945e3349d02184ff646093f2209409b129a Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Fri, 10 Jan 2025 14:18:49 -0800
Subject: [PATCH 16/29] optimize another index attempt check

---
 ...add_index_to_index_attempt_time_created.py | 36 +++++++++++++++++++
 .../background/celery/tasks/indexing/tasks.py |  9 +++++
 backend/onyx/db/index_attempt.py              | 14 ++------
 backend/onyx/db/models.py                     |  3 +-
 4 files changed, 50 insertions(+), 12 deletions(-)
 create mode 100644 backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py

diff --git a/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py b/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
new file mode 100644
index 00000000000..23db56bd61e
--- /dev/null
+++ b/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
@@ -0,0 +1,36 @@
+"""add indexes to index_attempt.status and index_attempt.time_created
+
+Revision ID: 0f7ff6d75b57
+Revises: 369644546676
+Create Date: 2025-01-10 14:01:14.067144
+
+"""
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "0f7ff6d75b57"
+down_revision = "369644546676"
+branch_labels: None = None
+depends_on: None = None
+
+
+def upgrade() -> None:
+    op.create_index(
+        op.f("ix_index_attempt_status"),
+        "index_attempt",
+        ["status"],
+        unique=False,
+    )
+
+    op.create_index(
+        op.f("ix_index_attempt_time_created"),
+        "index_attempt",
+        ["time_created"],
+        unique=False,
+    )
+
+
+def downgrade() -> None:
+    op.drop_index(op.f("ix_index_attempt_time_created"), table_name="index_attempt")
+
+    op.drop_index(op.f("ix_index_attempt_status"), table_name="index_attempt")
diff --git a/backend/onyx/background/celery/tasks/indexing/tasks.py b/backend/onyx/background/celery/tasks/indexing/tasks.py
index 9fd73972d0e..bbc64d2420e 100644
--- a/backend/onyx/background/celery/tasks/indexing/tasks.py
+++ b/backend/onyx/background/celery/tasks/indexing/tasks.py
@@ -417,6 +417,15 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
             unfenced_attempt_ids = get_unfenced_index_attempt_ids(
                 db_session, redis_client
             )
+
+            if tenant_id in debug_tenants:
+                ttl = redis_client.ttl(OnyxRedisLocks.CHECK_INDEXING_BEAT_LOCK)
+                task_logger.info(
+                    f"check_for_indexing after get unfenced lock: "
+                    f"tenant={tenant_id} "
+                    f"ttl={ttl}"
+                )
+
             for attempt_id in unfenced_attempt_ids:
                 # debugging logic - remove after we're done
                 if tenant_id in debug_tenants:
diff --git a/backend/onyx/db/index_attempt.py b/backend/onyx/db/index_attempt.py
index 20e8521d207..692a27976f5 100644
--- a/backend/onyx/db/index_attempt.py
+++ b/backend/onyx/db/index_attempt.py
@@ -9,7 +9,6 @@
 from sqlalchemy import func
 from sqlalchemy import select
 from sqlalchemy import update
-from sqlalchemy.orm import joinedload
 from sqlalchemy.orm import Session
 
 from onyx.connectors.models import Document
@@ -118,21 +117,14 @@ def get_in_progress_index_attempts(
 def get_all_index_attempts_by_status(
     status: IndexingStatus, db_session: Session
 ) -> list[IndexAttempt]:
-    """This eagerly loads the connector and credential so that the db_session can be expired
-    before running long-living indexing jobs, which causes increasing memory usage.
+    """Returns index attempts with the given status.
+    Only call this with non-terminal statuses: the list of attempts in a
+    terminal status may be quite large.
 
     Results are ordered by time_created (oldest to newest)."""
     stmt = select(IndexAttempt)
     stmt = stmt.where(IndexAttempt.status == status)
     stmt = stmt.order_by(IndexAttempt.time_created)
-    stmt = stmt.options(
-        joinedload(IndexAttempt.connector_credential_pair).joinedload(
-            ConnectorCredentialPair.connector
-        ),
-        joinedload(IndexAttempt.connector_credential_pair).joinedload(
-            ConnectorCredentialPair.credential
-        ),
-    )
     new_attempts = db_session.scalars(stmt)
     return list(new_attempts.all())
 
diff --git a/backend/onyx/db/models.py b/backend/onyx/db/models.py
index ff1c98d13d8..a1f7967641e 100644
--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -763,7 +763,7 @@ class IndexAttempt(Base):
     # the run once API
     from_beginning: Mapped[bool] = mapped_column(Boolean)
     status: Mapped[IndexingStatus] = mapped_column(
-        Enum(IndexingStatus, native_enum=False)
+        Enum(IndexingStatus, native_enum=False), index=True
     )
     # The two below may be slightly out of sync if user switches Embedding Model
     new_docs_indexed: Mapped[int | None] = mapped_column(Integer, default=0)
@@ -782,6 +782,7 @@ class IndexAttempt(Base):
     time_created: Mapped[datetime.datetime] = mapped_column(
         DateTime(timezone=True),
         server_default=func.now(),
+        index=True,
     )
     # when the actual indexing run began
     # NOTE: will use the api_server clock rather than DB server clock

From ccef35028700e9467e9ac501f329ed84ede7e963 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Fri, 10 Jan 2025 14:19:31 -0800
Subject: [PATCH 17/29] try using spawn specifically

---
 backend/onyx/background/indexing/job_client.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/backend/onyx/background/indexing/job_client.py b/backend/onyx/background/indexing/job_client.py
index 444894f8d6e..c99e12fbc7d 100644
--- a/backend/onyx/background/indexing/job_client.py
+++ b/backend/onyx/background/indexing/job_client.py
@@ -4,9 +4,10 @@
 
 NOTE: cannot use Celery directly due to
 https://github.com/celery/celery/issues/7007#issuecomment-1740139367"""
+import multiprocessing as mp
 from collections.abc import Callable
 from dataclasses import dataclass
-from multiprocessing import Process
+from multiprocessing.context import SpawnProcess
 from typing import Any
 from typing import Literal
 from typing import Optional
@@ -63,7 +64,7 @@ class SimpleJob:
     """Drop in replacement for `dask.distributed.Future`"""
 
     id: int
-    process: Optional["Process"] = None
+    process: Optional["SpawnProcess"] = None
 
     def cancel(self) -> bool:
         return self.release()
@@ -131,7 +132,8 @@ def submit(self, func: Callable, *args: Any, pure: bool = True) -> SimpleJob | N
         job_id = self.job_id_counter
         self.job_id_counter += 1
 
-        process = Process(target=_run_in_process, args=(func, args), daemon=True)
+        ctx = mp.get_context("spawn")
+        process = ctx.Process(target=_run_in_process, args=(func, args), daemon=True)
         job = SimpleJob(id=job_id, process=process)
         process.start()
 

From 6afd27f9c9d2be6d51d3fba6ad209615b8b46b0e Mon Sep 17 00:00:00 2001
From: hagen-danswer <hagen@danswer.ai>
Date: Fri, 10 Jan 2025 16:51:33 -0800
Subject: [PATCH 18/29] fix group sync name capitalization (#3653)

* fix group sync name capitalization

* everything is lowercased now

* comments

* Added test for be2ab2aa50ee migration

* polish
---
 .../be2ab2aa50ee_fix_capitalization.py        |  38 ++++++
 backend/ee/onyx/db/document.py                |   6 +-
 backend/ee/onyx/db/external_perm.py           |  17 ++-
 backend/onyx/access/utils.py                  |   9 +-
 .../tasks/doc_permission_syncing/tasks.py     |   4 +-
 .../connectors/confluence/onyx_confluence.py  |  55 ++++----
 .../tests/integration/common_utils/reset.py   | 117 +++++++++-------
 .../tests/migrations/test_migrations.py       | 125 ++++++++++++++++++
 8 files changed, 285 insertions(+), 86 deletions(-)
 create mode 100644 backend/alembic/versions/be2ab2aa50ee_fix_capitalization.py
 create mode 100644 backend/tests/integration/tests/migrations/test_migrations.py

diff --git a/backend/alembic/versions/be2ab2aa50ee_fix_capitalization.py b/backend/alembic/versions/be2ab2aa50ee_fix_capitalization.py
new file mode 100644
index 00000000000..ea6f201cc65
--- /dev/null
+++ b/backend/alembic/versions/be2ab2aa50ee_fix_capitalization.py
@@ -0,0 +1,38 @@
+"""fix_capitalization
+
+Revision ID: be2ab2aa50ee
+Revises: 369644546676
+Create Date: 2025-01-10 13:13:26.228960
+
+"""
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "be2ab2aa50ee"
+down_revision = "369644546676"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.execute(
+        """
+        UPDATE document
+        SET
+            external_user_group_ids = ARRAY(
+                SELECT LOWER(unnest(external_user_group_ids))
+            ),
+            last_modified = NOW()
+        WHERE
+            external_user_group_ids IS NOT NULL
+            AND external_user_group_ids::text[] <> ARRAY(
+                SELECT LOWER(unnest(external_user_group_ids))
+            )::text[]
+    """
+    )
+
+
+def downgrade() -> None:
+    # No way to cleanly persist the bad state through an upgrade/downgrade
+    # cycle, so we just pass
+    pass
diff --git a/backend/ee/onyx/db/document.py b/backend/ee/onyx/db/document.py
index 2ec5a3623c1..ad61cff4fef 100644
--- a/backend/ee/onyx/db/document.py
+++ b/backend/ee/onyx/db/document.py
@@ -5,7 +5,7 @@
 from sqlalchemy.orm import Session
 
 from onyx.access.models import ExternalAccess
-from onyx.access.utils import prefix_group_w_source
+from onyx.access.utils import build_ext_group_name_for_onyx
 from onyx.configs.constants import DocumentSource
 from onyx.db.models import Document as DbDocument
 
@@ -25,7 +25,7 @@ def upsert_document_external_perms__no_commit(
     ).first()
 
     prefixed_external_groups = [
-        prefix_group_w_source(
+        build_ext_group_name_for_onyx(
             ext_group_name=group_id,
             source=source_type,
         )
@@ -66,7 +66,7 @@ def upsert_document_external_perms(
     ).first()
 
     prefixed_external_groups: set[str] = {
-        prefix_group_w_source(
+        build_ext_group_name_for_onyx(
             ext_group_name=group_id,
             source=source_type,
         )
diff --git a/backend/ee/onyx/db/external_perm.py b/backend/ee/onyx/db/external_perm.py
index 16de8bb4110..9992f86df60 100644
--- a/backend/ee/onyx/db/external_perm.py
+++ b/backend/ee/onyx/db/external_perm.py
@@ -6,8 +6,9 @@
 from sqlalchemy import select
 from sqlalchemy.orm import Session
 
-from onyx.access.utils import prefix_group_w_source
+from onyx.access.utils import build_ext_group_name_for_onyx
 from onyx.configs.constants import DocumentSource
+from onyx.db.models import User
 from onyx.db.models import User__ExternalUserGroupId
 from onyx.db.users import batch_add_ext_perm_user_if_not_exists
 from onyx.db.users import get_user_by_email
@@ -61,8 +62,10 @@ def replace_user__ext_group_for_cc_pair(
             all_group_member_emails.add(user_email)
 
     # batch add users if they don't exist and get their ids
-    all_group_members = batch_add_ext_perm_user_if_not_exists(
-        db_session=db_session, emails=list(all_group_member_emails)
+    all_group_members: list[User] = batch_add_ext_perm_user_if_not_exists(
+        db_session=db_session,
+        # NOTE: this function handles case sensitivity for emails
+        emails=list(all_group_member_emails),
     )
 
     delete_user__ext_group_for_cc_pair__no_commit(
@@ -84,12 +87,14 @@ def replace_user__ext_group_for_cc_pair(
                     f" with email {user_email} not found"
                 )
                 continue
+            external_group_id = build_ext_group_name_for_onyx(
+                ext_group_name=external_group.id,
+                source=source,
+            )
             new_external_permissions.append(
                 User__ExternalUserGroupId(
                     user_id=user_id,
-                    external_user_group_id=prefix_group_w_source(
-                        external_group.id, source
-                    ),
+                    external_user_group_id=external_group_id,
                     cc_pair_id=cc_pair_id,
                 )
             )
diff --git a/backend/onyx/access/utils.py b/backend/onyx/access/utils.py
index 3ff9c42bc71..52d0e32748c 100644
--- a/backend/onyx/access/utils.py
+++ b/backend/onyx/access/utils.py
@@ -19,6 +19,9 @@ def prefix_external_group(ext_group_name: str) -> str:
     return f"external_group:{ext_group_name}"
 
 
-def prefix_group_w_source(ext_group_name: str, source: DocumentSource) -> str:
-    """External groups may collide across sources, every source needs its own prefix."""
-    return f"{source.value.upper()}_{ext_group_name}"
+def build_ext_group_name_for_onyx(ext_group_name: str, source: DocumentSource) -> str:
+    """
+    External group names may collide across sources, so each gets a source prefix.
+    NOTE: the full name (prefix included) is lowercased to avoid case mismatches
+    """
+    return f"{source.value}_{ext_group_name}".lower()
diff --git a/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py b/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
index 20ad0a07565..5e1e3c2c0f4 100644
--- a/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
+++ b/backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
@@ -391,5 +391,7 @@ def update_external_document_permissions_task(
             )
         return True
     except Exception:
-        logger.exception("Error Syncing Document Permissions")
+        logger.exception(
+            f"Error Syncing Document Permissions: connector_id={connector_id} doc_id={doc_id}"
+        )
         return False
diff --git a/backend/onyx/connectors/confluence/onyx_confluence.py b/backend/onyx/connectors/confluence/onyx_confluence.py
index e6a2b957ee7..96b9a370207 100644
--- a/backend/onyx/connectors/confluence/onyx_confluence.py
+++ b/backend/onyx/connectors/confluence/onyx_confluence.py
@@ -135,32 +135,6 @@ def __init__(self, url: str, *args: Any, **kwargs: Any) -> None:
         super(OnyxConfluence, self).__init__(url, *args, **kwargs)
         self._wrap_methods()
 
-    def get_current_user(self, expand: str | None = None) -> Any:
-        """
-        Implements a method that isn't in the third party client.
-
-        Get information about the current user
-        :param expand: OPTIONAL expand for get status of user.
-                Possible param is "status". Results are "Active, Deactivated"
-        :return: Returns the user details
-        """
-
-        from atlassian.errors import ApiPermissionError  # type:ignore
-
-        url = "rest/api/user/current"
-        params = {}
-        if expand:
-            params["expand"] = expand
-        try:
-            response = self.get(url, params=params)
-        except HTTPError as e:
-            if e.response.status_code == 403:
-                raise ApiPermissionError(
-                    "The calling user does not have permission", reason=e
-                )
-            raise
-        return response
-
     def _wrap_methods(self) -> None:
         """
         For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
@@ -363,6 +337,9 @@ def get_all_space_permissions_server(
         fetch the permissions of a space.
         This is better logging than calling the get_space_permissions method
         because it returns a jsonrpc response.
+        TODO: Make this call these endpoints for newer confluence versions:
+        - /rest/api/space/{spaceKey}/permissions
+        - /rest/api/space/{spaceKey}/permissions/anonymous
         """
         url = "rpc/json-rpc/confluenceservice-v2"
         data = {
@@ -381,6 +358,32 @@ def get_all_space_permissions_server(
 
         return response.get("result", [])
 
+    def get_current_user(self, expand: str | None = None) -> Any:
+        """
+        Implements a method that isn't in the third party client.
+
+        Get information about the current user
+        :param expand: OPTIONAL expand for get status of user.
+                Possible param is "status". Results are "Active, Deactivated"
+        :return: Returns the user details
+        """
+
+        from atlassian.errors import ApiPermissionError  # type:ignore
+
+        url = "rest/api/user/current"
+        params = {}
+        if expand:
+            params["expand"] = expand
+        try:
+            response = self.get(url, params=params)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise ApiPermissionError(
+                    "The calling user does not have permission", reason=e
+                )
+            raise
+        return response
+
 
 def _validate_connector_configuration(
     credentials: dict[str, Any],
diff --git a/backend/tests/integration/common_utils/reset.py b/backend/tests/integration/common_utils/reset.py
index 116d91c420d..4c6afb10388 100644
--- a/backend/tests/integration/common_utils/reset.py
+++ b/backend/tests/integration/common_utils/reset.py
@@ -63,57 +63,57 @@ def _run_migrations(
     logging.getLogger("alembic").setLevel(logging.INFO)
 
 
-def reset_postgres(
-    database: str = "postgres", config_name: str = "alembic", setup_onyx: bool = True
+def downgrade_postgres(
+    database: str = "postgres",
+    config_name: str = "alembic",
+    revision: str = "base",
+    clear_data: bool = False,
 ) -> None:
-    """Reset the Postgres database."""
-
-    # NOTE: need to delete all rows to allow migrations to be rolled back
-    # as there are a few downgrades that don't properly handle data in tables
-    conn = psycopg2.connect(
-        dbname=database,
-        user=POSTGRES_USER,
-        password=POSTGRES_PASSWORD,
-        host=POSTGRES_HOST,
-        port=POSTGRES_PORT,
-    )
-    cur = conn.cursor()
+    """Downgrade the Postgres database to `revision` (defaults to "base")."""
+    if clear_data:
+        if revision != "base":
+            logger.warning("Clearing data without rolling back to base state")
+        # Delete all rows to allow migrations to be rolled back
+        conn = psycopg2.connect(
+            dbname=database,
+            user=POSTGRES_USER,
+            password=POSTGRES_PASSWORD,
+            host=POSTGRES_HOST,
+            port=POSTGRES_PORT,
+        )
+        cur = conn.cursor()
 
-    # Disable triggers to prevent foreign key constraints from being checked
-    cur.execute("SET session_replication_role = 'replica';")
+        # Disable triggers to prevent foreign key constraints from being checked
+        cur.execute("SET session_replication_role = 'replica';")
 
-    # Fetch all table names in the current database
-    cur.execute(
+        # Fetch all table names in the current database
+        cur.execute(
+            """
+            SELECT tablename
+            FROM pg_tables
+            WHERE schemaname = 'public'
         """
-        SELECT tablename
-        FROM pg_tables
-        WHERE schemaname = 'public'
-    """
-    )
-
-    tables = cur.fetchall()
+        )
 
-    for table in tables:
-        table_name = table[0]
+        tables = cur.fetchall()
 
-        # Don't touch migration history
-        if table_name == "alembic_version":
-            continue
+        for table in tables:
+            table_name = table[0]
 
-        # Don't touch Kombu
-        if table_name == "kombu_message" or table_name == "kombu_queue":
-            continue
+            # Don't touch migration history or Kombu
+            if table_name in ("alembic_version", "kombu_message", "kombu_queue"):
+                continue
 
-        cur.execute(f'DELETE FROM "{table_name}"')
+            cur.execute(f'DELETE FROM "{table_name}"')
 
-    # Re-enable triggers
-    cur.execute("SET session_replication_role = 'origin';")
+        # Re-enable triggers
+        cur.execute("SET session_replication_role = 'origin';")
 
-    conn.commit()
-    cur.close()
-    conn.close()
+        conn.commit()
+        cur.close()
+        conn.close()
 
-    # downgrade to base + upgrade back to head
+    # Downgrade to the requested revision
     conn_str = build_connection_string(
         db=database,
         user=POSTGRES_USER,
@@ -126,20 +126,43 @@ def reset_postgres(
         conn_str,
         config_name,
         direction="downgrade",
-        revision="base",
+        revision=revision,
+    )
+
+
+def upgrade_postgres(
+    database: str = "postgres", config_name: str = "alembic", revision: str = "head"
+) -> None:
+    """Upgrade the Postgres database to `revision` (defaults to "head")."""
+    conn_str = build_connection_string(
+        db=database,
+        user=POSTGRES_USER,
+        password=POSTGRES_PASSWORD,
+        host=POSTGRES_HOST,
+        port=POSTGRES_PORT,
+        db_api=SYNC_DB_API,
     )
     _run_migrations(
         conn_str,
         config_name,
         direction="upgrade",
-        revision="head",
+        revision=revision,
     )
-    if not setup_onyx:
-        return
 
-    # do the same thing as we do on API server startup
-    with get_session_context_manager() as db_session:
-        setup_postgres(db_session)
+
+def reset_postgres(
+    database: str = "postgres",
+    config_name: str = "alembic",
+    setup_onyx: bool = True,
+) -> None:
+    """Reset the Postgres database."""
+    downgrade_postgres(
+        database=database, config_name=config_name, revision="base", clear_data=True
+    )
+    upgrade_postgres(database=database, config_name=config_name, revision="head")
+    if setup_onyx:
+        with get_session_context_manager() as db_session:
+            setup_postgres(db_session)
 
 
 def reset_vespa() -> None:
diff --git a/backend/tests/integration/tests/migrations/test_migrations.py b/backend/tests/integration/tests/migrations/test_migrations.py
new file mode 100644
index 00000000000..19b6fb1fba6
--- /dev/null
+++ b/backend/tests/integration/tests/migrations/test_migrations.py
@@ -0,0 +1,125 @@
+import pytest
+from sqlalchemy import text
+
+from onyx.configs.constants import DEFAULT_BOOST
+from onyx.db.engine import get_session_context_manager
+from tests.integration.common_utils.reset import downgrade_postgres
+from tests.integration.common_utils.reset import upgrade_postgres
+
+
+@pytest.mark.skip(
+    reason="Migration test no longer needed - migration has been applied to production"
+)
+def test_fix_capitalization_migration() -> None:
+    """Test that the be2ab2aa50ee migration correctly lowercases external_user_group_ids"""
+    # Reset the database and run migrations up to the second to last migration
+    downgrade_postgres(
+        database="postgres", config_name="alembic", revision="base", clear_data=True
+    )
+    upgrade_postgres(
+        database="postgres",
+        config_name="alembic",
+        # Upgrade it to the migration before the fix
+        revision="369644546676",
+    )
+
+    # Insert test data with mixed case group IDs
+    test_data = [
+        {
+            "id": "test_doc_1",
+            "external_user_group_ids": ["Group1", "GROUP2", "group3"],
+            "semantic_id": "test_doc_1",
+            "boost": DEFAULT_BOOST,
+            "hidden": False,
+            "from_ingestion_api": False,
+            "last_modified": "NOW()",
+        },
+        {
+            "id": "test_doc_2",
+            "external_user_group_ids": ["UPPER1", "upper2", "UPPER3"],
+            "semantic_id": "test_doc_2",
+            "boost": DEFAULT_BOOST,
+            "hidden": False,
+            "from_ingestion_api": False,
+            "last_modified": "NOW()",
+        },
+    ]
+
+    # Insert the test data
+    with get_session_context_manager() as db_session:
+        for doc in test_data:
+            db_session.execute(
+                text(
+                    """
+                    INSERT INTO document (
+                        id,
+                        external_user_group_ids,
+                        semantic_id,
+                        boost,
+                        hidden,
+                        from_ingestion_api,
+                        last_modified
+                    )
+                    VALUES (
+                        :id,
+                        :group_ids,
+                        :semantic_id,
+                        :boost,
+                        :hidden,
+                        :from_ingestion_api,
+                        :last_modified
+                    )
+                    """
+                ),
+                {
+                    "id": doc["id"],
+                    "group_ids": doc["external_user_group_ids"],
+                    "semantic_id": doc["semantic_id"],
+                    "boost": doc["boost"],
+                    "hidden": doc["hidden"],
+                    "from_ingestion_api": doc["from_ingestion_api"],
+                    "last_modified": doc["last_modified"],
+                },
+            )
+        db_session.commit()
+
+    # Verify the data was inserted correctly
+    with get_session_context_manager() as db_session:
+        results = db_session.execute(
+            text(
+                """
+                SELECT id, external_user_group_ids
+                FROM document
+                WHERE id IN ('test_doc_1', 'test_doc_2')
+                ORDER BY id
+                """
+            )
+        ).fetchall()
+
+        # Verify initial state
+        assert len(results) == 2
+        assert results[0].external_user_group_ids == ["Group1", "GROUP2", "group3"]
+        assert results[1].external_user_group_ids == ["UPPER1", "upper2", "UPPER3"]
+
+    # Run migrations again to apply the fix
+    upgrade_postgres(
+        database="postgres", config_name="alembic", revision="be2ab2aa50ee"
+    )
+
+    # Verify the fix was applied
+    with get_session_context_manager() as db_session:
+        results = db_session.execute(
+            text(
+                """
+                SELECT id, external_user_group_ids
+                FROM document
+                WHERE id IN ('test_doc_1', 'test_doc_2')
+                ORDER BY id
+                """
+            )
+        ).fetchall()
+
+        # Verify all group IDs are lowercase
+        assert len(results) == 2
+        assert results[0].external_user_group_ids == ["group1", "group2", "group3"]
+        assert results[1].external_user_group_ids == ["upper1", "upper2", "upper3"]

From f5bdf9d2c9f5a2130c98a22e20b9c89d7ae8efcc Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 02:46:03 -0800
Subject: [PATCH 19/29] move to celeryd_init

---
 .../onyx/background/celery/apps/app_base.py   | 23 +++++++++++++++++++
 backend/onyx/background/celery/apps/light.py  |  9 --------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/backend/onyx/background/celery/apps/app_base.py b/backend/onyx/background/celery/apps/app_base.py
index 5e767dfbefc..440b7cba32d 100644
--- a/backend/onyx/background/celery/apps/app_base.py
+++ b/backend/onyx/background/celery/apps/app_base.py
@@ -1,4 +1,5 @@
 import logging
+import multiprocessing
 import time
 from typing import Any
 
@@ -167,6 +168,28 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
     # logger.info(f"Multiprocessing start method - setting to spawn.")
     # multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
 
+    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
+    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
+
+    try:
+        multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
+    except Exception:
+        logger.info(
+            "multiprocessing.set_start_method exceptioned. Trying force=True..."
+        )
+        try:
+            multiprocessing.set_start_method(
+                "spawn", force=True
+            )  # fork is unsafe, set to spawn
+        except Exception:
+            logger.info(
+                "multiprocessing.set_start_method force=True exceptioned even with force=True."
+            )
+
+    logger.info(
+        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
+    )
+
 
 def wait_for_redis(sender: Any, **kwargs: Any) -> None:
     """Waits for redis to become ready subject to a hardcoded timeout.
diff --git a/backend/onyx/background/celery/apps/light.py b/backend/onyx/background/celery/apps/light.py
index 73058ad3218..695bda69cc0 100644
--- a/backend/onyx/background/celery/apps/light.py
+++ b/backend/onyx/background/celery/apps/light.py
@@ -1,4 +1,3 @@
-import multiprocessing
 from typing import Any
 
 from celery import Celery
@@ -57,14 +56,6 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
-    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
-    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
-
-    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(
-        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
-    )
-
     logger.info(f"Concurrency: {sender.concurrency}")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)

From be3cfdd4a6bd95d195b5948cc1e5822df39282dd Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 10:46:20 -0800
Subject: [PATCH 20/29] saved files

---
 backend/onyx/background/celery/apps/heavy.py  | 19 -------------------
 .../onyx/background/celery/apps/indexing.py   | 13 +------------
 .../onyx/background/celery/apps/primary.py    |  9 ---------
 3 files changed, 1 insertion(+), 40 deletions(-)

diff --git a/backend/onyx/background/celery/apps/heavy.py b/backend/onyx/background/celery/apps/heavy.py
index c49ccfa751b..7216e858d42 100644
--- a/backend/onyx/background/celery/apps/heavy.py
+++ b/backend/onyx/background/celery/apps/heavy.py
@@ -1,4 +1,3 @@
-import multiprocessing
 from typing import Any
 
 from celery import Celery
@@ -57,24 +56,6 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
-    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
-    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
-
-    try:
-        multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    except Exception:
-        logger.info("multiprocessing.set_start_method exceptioned.")
-        try:
-            multiprocessing.set_start_method(
-                "spawn", force=True
-            )  # fork is unsafe, set to spawn
-        except Exception:
-            logger.info("multiprocessing.set_start_method force=True exceptioned.")
-
-    logger.info(
-        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
-    )
-
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
     SqlEngine.init_engine(pool_size=4, max_overflow=12)
 
diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index 1db1641ae52..0c116984f7c 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -1,4 +1,3 @@
-import multiprocessing
 from typing import Any
 
 from celery import Celery
@@ -58,21 +57,11 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
-    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
-    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
-
-    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(
-        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
-    )
-
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
 
     # rkuo: been seeing transient connection exceptions here, so upping the connection count
     # from just concurrency/concurrency to concurrency/concurrency*2
-    SqlEngine.init_engine(
-        pool_size=sender.concurrency, max_overflow=sender.concurrency * 2
-    )
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
 
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
diff --git a/backend/onyx/background/celery/apps/primary.py b/backend/onyx/background/celery/apps/primary.py
index 23f24852796..b4f9868ac58 100644
--- a/backend/onyx/background/celery/apps/primary.py
+++ b/backend/onyx/background/celery/apps/primary.py
@@ -1,5 +1,4 @@
 import logging
-import multiprocessing
 from typing import Any
 from typing import cast
 
@@ -81,14 +80,6 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None
 def on_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
-    all_start_methods: list[str] = multiprocessing.get_all_start_methods()
-    logger.info(f"Multiprocessing all start methods: {all_start_methods}")
-
-    multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
-    logger.info(
-        f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
-    )
-
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
     SqlEngine.init_engine(pool_size=8, max_overflow=0)
 

From 9a09222b7d42f00255206b340798a62e27211aab Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 10:58:33 -0800
Subject: [PATCH 21/29] add comments

---
 backend/onyx/background/celery/apps/app_base.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/onyx/background/celery/apps/app_base.py b/backend/onyx/background/celery/apps/app_base.py
index 440b7cba32d..3f0d50950b5 100644
--- a/backend/onyx/background/celery/apps/app_base.py
+++ b/backend/onyx/background/celery/apps/app_base.py
@@ -163,10 +163,10 @@ def on_task_postrun(
 
 def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
     """The first signal sent on celery worker startup"""
-    # rkuo: commenting out as set_start_method seems to work here on macOS
-    # but not in the cloud and it is unclear why.
-    # logger.info(f"Multiprocessing start method - setting to spawn.")
-    # multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
+
+    # NOTE(rkuo): start method "fork" is unsafe and we really need it to be "spawn"
+    # But something is blocking set_start_method from working in the cloud unless
+    # force=True. so we use force=True as a fallback.
 
     all_start_methods: list[str] = multiprocessing.get_all_start_methods()
     logger.info(f"Multiprocessing all start methods: {all_start_methods}")

From b6dd999c1b795484ba222fbbfe371a7baff84c79 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 11:31:57 -0800
Subject: [PATCH 22/29] add some type hints

---
 backend/onyx/background/celery/apps/app_base.py |  2 +-
 backend/onyx/background/celery/apps/heavy.py    |  7 ++++---
 backend/onyx/background/celery/apps/indexing.py |  7 ++++---
 backend/onyx/background/celery/apps/light.py    | 10 +++++-----
 backend/onyx/background/celery/apps/primary.py  |  7 ++++---
 5 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/backend/onyx/background/celery/apps/app_base.py b/backend/onyx/background/celery/apps/app_base.py
index 3f0d50950b5..9b320aae425 100644
--- a/backend/onyx/background/celery/apps/app_base.py
+++ b/backend/onyx/background/celery/apps/app_base.py
@@ -161,7 +161,7 @@ def on_task_postrun(
         return
 
 
-def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
+def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
     """The first signal sent on celery worker startup"""
 
     # NOTE(rkuo): start method "fork" is unsafe and we really need it to be "spawn"
diff --git a/backend/onyx/background/celery/apps/heavy.py b/backend/onyx/background/celery/apps/heavy.py
index 7216e858d42..4854940fd9c 100644
--- a/backend/onyx/background/celery/apps/heavy.py
+++ b/backend/onyx/background/celery/apps/heavy.py
@@ -3,6 +3,7 @@
 from celery import Celery
 from celery import signals
 from celery import Task
+from celery.apps.worker import Worker
 from celery.signals import celeryd_init
 from celery.signals import worker_init
 from celery.signals import worker_ready
@@ -48,16 +49,16 @@ def on_task_postrun(
 
 
 @celeryd_init.connect
-def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
+def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
     app_base.on_celeryd_init(sender, conf, **kwargs)
 
 
 @worker_init.connect
-def on_worker_init(sender: Any, **kwargs: Any) -> None:
+def on_worker_init(sender: Worker, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
-    SqlEngine.init_engine(pool_size=4, max_overflow=12)
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)  # type: ignore
 
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index 0c116984f7c..89681ea741e 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -3,6 +3,7 @@
 from celery import Celery
 from celery import signals
 from celery import Task
+from celery.apps.worker import Worker
 from celery.signals import celeryd_init
 from celery.signals import worker_init
 from celery.signals import worker_process_init
@@ -49,19 +50,19 @@ def on_task_postrun(
 
 
 @celeryd_init.connect
-def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
+def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
     app_base.on_celeryd_init(sender, conf, **kwargs)
 
 
 @worker_init.connect
-def on_worker_init(sender: Any, **kwargs: Any) -> None:
+def on_worker_init(sender: Worker, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
 
     # rkuo: been seeing transient connection exceptions here, so upping the connection count
     # from just concurrency/concurrency to concurrency/concurrency*2
-    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)  # type: ignore
 
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
diff --git a/backend/onyx/background/celery/apps/light.py b/backend/onyx/background/celery/apps/light.py
index 695bda69cc0..abc2cfab128 100644
--- a/backend/onyx/background/celery/apps/light.py
+++ b/backend/onyx/background/celery/apps/light.py
@@ -3,6 +3,7 @@
 from celery import Celery
 from celery import signals
 from celery import Task
+from celery.apps.worker import Worker
 from celery.signals import celeryd_init
 from celery.signals import worker_init
 from celery.signals import worker_ready
@@ -14,7 +15,6 @@
 from onyx.utils.logger import setup_logger
 from shared_configs.configs import MULTI_TENANT
 
-
 logger = setup_logger()
 
 celery_app = Celery(__name__)
@@ -48,18 +48,18 @@ def on_task_postrun(
 
 
 @celeryd_init.connect
-def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
+def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
     app_base.on_celeryd_init(sender, conf, **kwargs)
 
 
 @worker_init.connect
-def on_worker_init(sender: Any, **kwargs: Any) -> None:
+def on_worker_init(sender: Worker, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
-    logger.info(f"Concurrency: {sender.concurrency}")
+    logger.info(f"Concurrency: {sender.concurrency}")  # type: ignore
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
-    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)  # type: ignore
 
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
diff --git a/backend/onyx/background/celery/apps/primary.py b/backend/onyx/background/celery/apps/primary.py
index b4f9868ac58..8056e3d5e19 100644
--- a/backend/onyx/background/celery/apps/primary.py
+++ b/backend/onyx/background/celery/apps/primary.py
@@ -6,6 +6,7 @@
 from celery import Celery
 from celery import signals
 from celery import Task
+from celery.apps.worker import Worker
 from celery.exceptions import WorkerShutdown
 from celery.signals import celeryd_init
 from celery.signals import worker_init
@@ -72,12 +73,12 @@ def on_task_postrun(
 
 
 @celeryd_init.connect
-def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
+def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
     app_base.on_celeryd_init(sender, conf, **kwargs)
 
 
 @worker_init.connect
-def on_worker_init(sender: Any, **kwargs: Any) -> None:
+def on_worker_init(sender: Worker, **kwargs: Any) -> None:
     logger.info("worker_init signal received.")
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
@@ -133,7 +134,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
         raise WorkerShutdown("Primary worker lock could not be acquired!")
 
     # tacking on our own user data to the sender
-    sender.primary_worker_lock = lock
+    sender.primary_worker_lock = lock  # type: ignore
 
     # As currently designed, when this worker starts as "primary", we reinitialize redis
     # to a clean state (for our purposes, anyway)

From d96d2fc6e963f0f5c1993a268814f5dce4685a24 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 11:35:58 -0800
Subject: [PATCH 23/29] add comment

---
 backend/onyx/background/indexing/job_client.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/onyx/background/indexing/job_client.py b/backend/onyx/background/indexing/job_client.py
index c99e12fbc7d..a679eebe7fa 100644
--- a/backend/onyx/background/indexing/job_client.py
+++ b/backend/onyx/background/indexing/job_client.py
@@ -132,6 +132,8 @@ def submit(self, func: Callable, *args: Any, pure: bool = True) -> SimpleJob | N
         job_id = self.job_id_counter
         self.job_id_counter += 1
 
+        # this approach allows us to always "spawn" a new process regardless of
+        # get_start_method's current setting
         ctx = mp.get_context("spawn")
         process = ctx.Process(target=_run_in_process, args=(func, args), daemon=True)
         job = SimpleJob(id=job_id, process=process)

From 4f8e48df7c8497f53d6df96e81a437b95fe7f8c6 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 11:50:04 -0800
Subject: [PATCH 24/29] try more sql settings

---
 backend/onyx/background/celery/apps/indexing.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index 89681ea741e..a241ff19b00 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -60,9 +60,13 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
 
-    # rkuo: been seeing transient connection exceptions here, so upping the connection count
-    # from just concurrency/concurrency to concurrency/concurrency*2
-    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)  # type: ignore
+    # rkuo: Transient errors keep happening in the worker threads for indexing
+    # "SSL connection has been closed unexpectedly"
+    # fixing spawn method didn't help (although it seemed like it should)
+    # setting pre ping might help.
+    SqlEngine.init_engine(
+        pool_size=sender.concurrency, max_overflow=8, pool_pre_ping=True
+    )  # type: ignore
 
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)

From 7d86b2833585fce6c553504fd3201e8423648ebe Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 12:14:32 -0800
Subject: [PATCH 25/29] maybe we don't need pre ping yet

---
 backend/onyx/background/celery/apps/app_base.py |  4 ++--
 backend/onyx/background/celery/apps/indexing.py | 10 ++++------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/backend/onyx/background/celery/apps/app_base.py b/backend/onyx/background/celery/apps/app_base.py
index 9b320aae425..40a98f38abe 100644
--- a/backend/onyx/background/celery/apps/app_base.py
+++ b/backend/onyx/background/celery/apps/app_base.py
@@ -175,7 +175,7 @@ def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
         multiprocessing.set_start_method("spawn")  # fork is unsafe, set to spawn
     except Exception:
         logger.info(
-            "multiprocessing.set_start_method exceptioned. Trying force=True..."
+            "Multiprocessing set_start_method exceptioned. Trying force=True..."
         )
         try:
             multiprocessing.set_start_method(
@@ -183,7 +183,7 @@ def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
             )  # fork is unsafe, set to spawn
         except Exception:
             logger.info(
-                "multiprocessing.set_start_method force=True exceptioned even with force=True."
+                "Multiprocessing set_start_method force=True exceptioned even with force=True."
             )
 
     logger.info(
diff --git a/backend/onyx/background/celery/apps/indexing.py b/backend/onyx/background/celery/apps/indexing.py
index a241ff19b00..e222da5e3b6 100644
--- a/backend/onyx/background/celery/apps/indexing.py
+++ b/backend/onyx/background/celery/apps/indexing.py
@@ -60,13 +60,11 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
 
-    # rkuo: Transient errors keep happening in the worker threads for indexing
+    # rkuo: Transient errors keep happening in the indexing watchdog threads.
     # "SSL connection has been closed unexpectedly"
-    # fixing spawn method didn't help (although it seemed like it should)
-    # setting pre ping might help.
-    SqlEngine.init_engine(
-        pool_size=sender.concurrency, max_overflow=8, pool_pre_ping=True
-    )  # type: ignore
+    # actually setting the spawn method in the cloud fixes 95% of these.
+    # setting pre ping might help even more, but not worrying about that yet
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)  # type: ignore
 
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)

From 9ea2ae267e11e1fb59611a8abae793403c686a15 Mon Sep 17 00:00:00 2001
From: Chris Weaver <25087905+Weves@users.noreply.github.com>
Date: Mon, 13 Jan 2025 12:36:45 -0800
Subject: [PATCH 26/29] Performance monitoring (#3658)

* Initial scaffolding for metrics

* iterate

* more

* More metrics + SyncRecord concept

* Add indices, standardize timing

* Small cleanup

* Address comments
---
 .vscode/launch.template.jsonc                 |  29 +-
 .../versions/97dbb53fa8c8_add_syncrecord.py   |  72 +++
 ...7bf7_add_time_updated_to_usergroup_and_.py |  41 ++
 .../background/celery/tasks/vespa/tasks.py    |  64 ++-
 backend/ee/onyx/db/user_group.py              |   6 +-
 .../onyx/background/celery/apps/monitoring.py |  95 ++++
 .../background/celery/configs/monitoring.py   |  21 +
 .../background/celery/tasks/beat_schedule.py  |  11 +
 .../celery/tasks/connector_deletion/tasks.py  |  17 +
 .../celery/tasks/monitoring/tasks.py          | 427 ++++++++++++++++++
 .../background/celery/tasks/vespa/tasks.py    |  81 +++-
 .../celery/versioned_apps/monitoring.py       |  15 +
 backend/onyx/configs/constants.py             |   5 +
 backend/onyx/db/document_set.py               |   3 +-
 backend/onyx/db/enums.py                      |  21 +-
 backend/onyx/db/models.py                     |  52 ++-
 backend/onyx/db/sync_record.py                | 110 +++++
 backend/onyx/utils/telemetry.py               |   1 +
 backend/supervisord.conf                      |  12 +
 19 files changed, 1061 insertions(+), 22 deletions(-)
 create mode 100644 backend/alembic/versions/97dbb53fa8c8_add_syncrecord.py
 create mode 100644 backend/alembic/versions/fec3db967bf7_add_time_updated_to_usergroup_and_.py
 create mode 100644 backend/onyx/background/celery/apps/monitoring.py
 create mode 100644 backend/onyx/background/celery/configs/monitoring.py
 create mode 100644 backend/onyx/background/celery/tasks/monitoring/tasks.py
 create mode 100644 backend/onyx/background/celery/versioned_apps/monitoring.py
 create mode 100644 backend/onyx/db/sync_record.py

diff --git a/.vscode/launch.template.jsonc b/.vscode/launch.template.jsonc
index c3dd6d9914d..8c965d36e80 100644
--- a/.vscode/launch.template.jsonc
+++ b/.vscode/launch.template.jsonc
@@ -28,6 +28,7 @@
 		  		"Celery heavy", 
 		  		"Celery indexing", 
 		  		"Celery beat",
+                "Celery monitoring",
             ],
 			"presentation": {
 				 "group": "1",
@@ -51,7 +52,8 @@
 		  		"Celery light", 
 		  		"Celery heavy", 
 		  		"Celery indexing", 
-		  		"Celery beat"
+		  		"Celery beat",
+                "Celery monitoring",
 		  	],
 			"presentation": {
 				 "group": "1",
@@ -269,6 +271,31 @@
 			 },
             "consoleTitle": "Celery indexing Console"
         },
+        {
+            "name": "Celery monitoring",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "celery",
+            "cwd": "${workspaceFolder}/backend",
+            "envFile": "${workspaceFolder}/.vscode/.env",
+            "env": {},
+            "args": [
+                "-A",
+                "onyx.background.celery.versioned_apps.monitoring",
+                "worker",
+                "--pool=solo",
+                "--concurrency=1",
+                "--prefetch-multiplier=1",
+                "--loglevel=INFO",
+                "--hostname=monitoring@%n",
+                "-Q",
+                "monitoring",
+            ],
+            "presentation": {
+				 "group": "2",
+			 },
+            "consoleTitle": "Celery monitoring Console"
+        },
         {
             "name": "Celery beat",
             "type": "debugpy",
diff --git a/backend/alembic/versions/97dbb53fa8c8_add_syncrecord.py b/backend/alembic/versions/97dbb53fa8c8_add_syncrecord.py
new file mode 100644
index 00000000000..1504de39cbf
--- /dev/null
+++ b/backend/alembic/versions/97dbb53fa8c8_add_syncrecord.py
@@ -0,0 +1,72 @@
+"""Add SyncRecord
+
+Revision ID: 97dbb53fa8c8
+Revises: be2ab2aa50ee
+Create Date: 2025-01-11 19:39:50.426302
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "97dbb53fa8c8"
+down_revision = "be2ab2aa50ee"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "sync_record",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("entity_id", sa.Integer(), nullable=False),
+        sa.Column(
+            "sync_type",
+            sa.Enum(
+                "DOCUMENT_SET",
+                "USER_GROUP",
+                "CONNECTOR_DELETION",
+                name="synctype",
+                native_enum=False,
+                length=40,
+            ),
+            nullable=False,
+        ),
+        sa.Column(
+            "sync_status",
+            sa.Enum(
+                "IN_PROGRESS",
+                "SUCCESS",
+                "FAILED",
+                "CANCELED",
+                name="syncstatus",
+                native_enum=False,
+                length=40,
+            ),
+            nullable=False,
+        ),
+        sa.Column("num_docs_synced", sa.Integer(), nullable=False),
+        sa.Column("sync_start_time", sa.DateTime(timezone=True), nullable=False),
+        sa.Column("sync_end_time", sa.DateTime(timezone=True), nullable=True),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # Add index for fetch_latest_sync_record query
+    op.create_index(
+        "ix_sync_record_entity_id_sync_type_sync_start_time",
+        "sync_record",
+        ["entity_id", "sync_type", "sync_start_time"],
+    )
+
+    # Add index for cleanup_sync_records query
+    op.create_index(
+        "ix_sync_record_entity_id_sync_type_sync_status",
+        "sync_record",
+        ["entity_id", "sync_type", "sync_status"],
+    )
+
+
+def downgrade() -> None:
+    op.drop_index("ix_sync_record_entity_id_sync_type_sync_status")
+    op.drop_index("ix_sync_record_entity_id_sync_type_sync_start_time")
+    op.drop_table("sync_record")
diff --git a/backend/alembic/versions/fec3db967bf7_add_time_updated_to_usergroup_and_.py b/backend/alembic/versions/fec3db967bf7_add_time_updated_to_usergroup_and_.py
new file mode 100644
index 00000000000..99bc3ffae0c
--- /dev/null
+++ b/backend/alembic/versions/fec3db967bf7_add_time_updated_to_usergroup_and_.py
@@ -0,0 +1,41 @@
+"""Add time_updated to UserGroup and DocumentSet
+
+Revision ID: fec3db967bf7
+Revises: 97dbb53fa8c8
+Create Date: 2025-01-12 15:49:02.289100
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "fec3db967bf7"
+down_revision = "97dbb53fa8c8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "document_set",
+        sa.Column(
+            "time_last_modified_by_user",
+            sa.DateTime(timezone=True),
+            nullable=False,
+            server_default=sa.func.now(),
+        ),
+    )
+    op.add_column(
+        "user_group",
+        sa.Column(
+            "time_last_modified_by_user",
+            sa.DateTime(timezone=True),
+            nullable=False,
+            server_default=sa.func.now(),
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("user_group", "time_last_modified_by_user")
+    op.drop_column("document_set", "time_last_modified_by_user")
diff --git a/backend/ee/onyx/background/celery/tasks/vespa/tasks.py b/backend/ee/onyx/background/celery/tasks/vespa/tasks.py
index bd6cd4c9f0b..45c65b73ce9 100644
--- a/backend/ee/onyx/background/celery/tasks/vespa/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/vespa/tasks.py
@@ -8,6 +8,9 @@
 from ee.onyx.db.user_group import mark_user_group_as_synced
 from ee.onyx.db.user_group import prepare_user_group_for_deletion
 from onyx.background.celery.apps.app_base import task_logger
+from onyx.db.enums import SyncStatus
+from onyx.db.enums import SyncType
+from onyx.db.sync_record import update_sync_record_status
 from onyx.redis.redis_usergroup import RedisUserGroup
 from onyx.utils.logger import setup_logger
 
@@ -43,24 +46,59 @@ def monitor_usergroup_taskset(
         f"User group sync progress: usergroup_id={usergroup_id} remaining={count} initial={initial_count}"
     )
     if count > 0:
+        update_sync_record_status(
+            db_session=db_session,
+            entity_id=usergroup_id,
+            sync_type=SyncType.USER_GROUP,
+            sync_status=SyncStatus.IN_PROGRESS,
+            num_docs_synced=count,
+        )
         return
 
     user_group = fetch_user_group(db_session=db_session, user_group_id=usergroup_id)
     if user_group:
         usergroup_name = user_group.name
-        if user_group.is_up_for_deletion:
-            # this prepare should have been run when the deletion was scheduled,
-            # but run it again to be sure we're ready to go
-            mark_user_group_as_synced(db_session, user_group)
-            prepare_user_group_for_deletion(db_session, usergroup_id)
-            delete_user_group(db_session=db_session, user_group=user_group)
-            task_logger.info(
-                f"Deleted usergroup: name={usergroup_name} id={usergroup_id}"
-            )
-        else:
-            mark_user_group_as_synced(db_session=db_session, user_group=user_group)
-            task_logger.info(
-                f"Synced usergroup. name={usergroup_name} id={usergroup_id}"
+        try:
+            if user_group.is_up_for_deletion:
+                # this prepare should have been run when the deletion was scheduled,
+                # but run it again to be sure we're ready to go
+                mark_user_group_as_synced(db_session, user_group)
+                prepare_user_group_for_deletion(db_session, usergroup_id)
+                delete_user_group(db_session=db_session, user_group=user_group)
+
+                update_sync_record_status(
+                    db_session=db_session,
+                    entity_id=usergroup_id,
+                    sync_type=SyncType.USER_GROUP,
+                    sync_status=SyncStatus.SUCCESS,
+                    num_docs_synced=initial_count,
+                )
+
+                task_logger.info(
+                    f"Deleted usergroup: name={usergroup_name} id={usergroup_id}"
+                )
+            else:
+                mark_user_group_as_synced(db_session=db_session, user_group=user_group)
+
+                update_sync_record_status(
+                    db_session=db_session,
+                    entity_id=usergroup_id,
+                    sync_type=SyncType.USER_GROUP,
+                    sync_status=SyncStatus.SUCCESS,
+                    num_docs_synced=initial_count,
+                )
+
+                task_logger.info(
+                    f"Synced usergroup. name={usergroup_name} id={usergroup_id}"
+                )
+        except Exception as e:
+            update_sync_record_status(
+                db_session=db_session,
+                entity_id=usergroup_id,
+                sync_type=SyncType.USER_GROUP,
+                sync_status=SyncStatus.FAILED,
+                num_docs_synced=initial_count,
             )
+            raise e
 
     rug.reset()
diff --git a/backend/ee/onyx/db/user_group.py b/backend/ee/onyx/db/user_group.py
index 1b1fcca74b4..0adff8097a9 100644
--- a/backend/ee/onyx/db/user_group.py
+++ b/backend/ee/onyx/db/user_group.py
@@ -374,7 +374,7 @@ def _add_user_group__cc_pair_relationships__no_commit(
 
 
 def insert_user_group(db_session: Session, user_group: UserGroupCreate) -> UserGroup:
-    db_user_group = UserGroup(name=user_group.name)
+    db_user_group = UserGroup(name=user_group.name, time_last_modified_by_user=func.now())
     db_session.add(db_user_group)
     db_session.flush()  # give the group an ID
 
@@ -630,6 +630,10 @@ def update_user_group(
         select(User).where(User.id.in_(removed_user_ids))  # type: ignore
     ).unique()
     _validate_curator_status__no_commit(db_session, list(removed_users))
+
+    # update "time_last_modified_by_user" to now
+    db_user_group.time_last_modified_by_user = func.now()
+
     db_session.commit()
     return db_user_group
 
diff --git a/backend/onyx/background/celery/apps/monitoring.py b/backend/onyx/background/celery/apps/monitoring.py
new file mode 100644
index 00000000000..49c78cafe1d
--- /dev/null
+++ b/backend/onyx/background/celery/apps/monitoring.py
@@ -0,0 +1,95 @@
+import multiprocessing
+from typing import Any
+
+from celery import Celery
+from celery import signals
+from celery import Task
+from celery.signals import celeryd_init
+from celery.signals import worker_init
+from celery.signals import worker_ready
+from celery.signals import worker_shutdown
+
+import onyx.background.celery.apps.app_base as app_base
+from onyx.configs.constants import POSTGRES_CELERY_WORKER_MONITORING_APP_NAME
+from onyx.db.engine import SqlEngine
+from onyx.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT
+
+
+logger = setup_logger()
+
+celery_app = Celery(__name__)
+celery_app.config_from_object("onyx.background.celery.configs.monitoring")
+
+
+@signals.task_prerun.connect
+def on_task_prerun(
+    sender: Any | None = None,
+    task_id: str | None = None,
+    task: Task | None = None,
+    args: tuple | None = None,
+    kwargs: dict | None = None,
+    **kwds: Any,
+) -> None:
+    app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds)
+
+
+@signals.task_postrun.connect
+def on_task_postrun(
+    sender: Any | None = None,
+    task_id: str | None = None,
+    task: Task | None = None,
+    args: tuple | None = None,
+    kwargs: dict | None = None,
+    retval: Any | None = None,
+    state: str | None = None,
+    **kwds: Any,
+) -> None:
+    app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds)
+
+
+@celeryd_init.connect
+def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
+    app_base.on_celeryd_init(sender, conf, **kwargs)
+
+
+@worker_init.connect
+def on_worker_init(sender: Any, **kwargs: Any) -> None:
+    logger.info("worker_init signal received.")
+    logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
+
+    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_MONITORING_APP_NAME)
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=3)
+
+    app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+
+    # Fewer startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
+    app_base.on_secondary_worker_init(sender, **kwargs)
+
+
+@worker_ready.connect
+def on_worker_ready(sender: Any, **kwargs: Any) -> None:
+    app_base.on_worker_ready(sender, **kwargs)
+
+
+@worker_shutdown.connect
+def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
+    app_base.on_worker_shutdown(sender, **kwargs)
+
+
+@signals.setup_logging.connect
+def on_setup_logging(
+    loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any
+) -> None:
+    app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs)
+
+
+celery_app.autodiscover_tasks(
+    [
+        "onyx.background.celery.tasks.monitoring",
+    ]
+)
diff --git a/backend/onyx/background/celery/configs/monitoring.py b/backend/onyx/background/celery/configs/monitoring.py
new file mode 100644
index 00000000000..90f7b889cce
--- /dev/null
+++ b/backend/onyx/background/celery/configs/monitoring.py
@@ -0,0 +1,21 @@
+import onyx.background.celery.configs.base as shared_config
+
+broker_url = shared_config.broker_url
+broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup
+broker_pool_limit = shared_config.broker_pool_limit
+broker_transport_options = shared_config.broker_transport_options
+
+redis_socket_keepalive = shared_config.redis_socket_keepalive
+redis_retry_on_timeout = shared_config.redis_retry_on_timeout
+redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval
+
+result_backend = shared_config.result_backend
+result_expires = shared_config.result_expires  # 86400 seconds is the default
+
+task_default_priority = shared_config.task_default_priority
+task_acks_late = shared_config.task_acks_late
+
+# Monitoring worker specific settings
+worker_concurrency = 1  # Single worker is sufficient for monitoring
+worker_pool = "solo"
+worker_prefetch_multiplier = 1
diff --git a/backend/onyx/background/celery/tasks/beat_schedule.py b/backend/onyx/background/celery/tasks/beat_schedule.py
index 8b08e77877c..58e27b91e5b 100644
--- a/backend/onyx/background/celery/tasks/beat_schedule.py
+++ b/backend/onyx/background/celery/tasks/beat_schedule.py
@@ -3,6 +3,7 @@
 
 from onyx.configs.app_configs import LLM_MODEL_UPDATE_API_URL
 from onyx.configs.constants import OnyxCeleryPriority
+from onyx.configs.constants import OnyxCeleryQueues
 from onyx.configs.constants import OnyxCeleryTask
 
 # choosing 15 minutes because it roughly gives us enough time to process many tasks
@@ -68,6 +69,16 @@
             "expires": BEAT_EXPIRES_DEFAULT,
         },
     },
+    {
+        "name": "monitor-background-processes",
+        "task": OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES,
+        "schedule": timedelta(minutes=5),
+        "options": {
+            "priority": OnyxCeleryPriority.LOW,
+            "expires": BEAT_EXPIRES_DEFAULT,
+            "queue": OnyxCeleryQueues.MONITORING,
+        },
+    },
     {
         "name": "check-for-doc-permissions-sync",
         "task": OnyxCeleryTask.CHECK_FOR_DOC_PERMISSIONS_SYNC,
diff --git a/backend/onyx/background/celery/tasks/connector_deletion/tasks.py b/backend/onyx/background/celery/tasks/connector_deletion/tasks.py
index 8c7647468d0..bf7e949d3f8 100644
--- a/backend/onyx/background/celery/tasks/connector_deletion/tasks.py
+++ b/backend/onyx/background/celery/tasks/connector_deletion/tasks.py
@@ -17,7 +17,10 @@
 from onyx.db.connector_credential_pair import get_connector_credential_pairs
 from onyx.db.engine import get_session_with_tenant
 from onyx.db.enums import ConnectorCredentialPairStatus
+from onyx.db.enums import SyncType
 from onyx.db.search_settings import get_all_search_settings
+from onyx.db.sync_record import cleanup_sync_records
+from onyx.db.sync_record import insert_sync_record
 from onyx.redis.redis_connector import RedisConnector
 from onyx.redis.redis_connector_delete import RedisConnectorDeletePayload
 from onyx.redis.redis_pool import get_redis_client
@@ -118,6 +121,13 @@ def try_generate_document_cc_pair_cleanup_tasks(
         return None
 
     if cc_pair.status != ConnectorCredentialPairStatus.DELETING:
+        # there should be no in-progress sync records if this is up to date
+        # clean it up just in case things got into a bad state
+        cleanup_sync_records(
+            db_session=db_session,
+            entity_id=cc_pair_id,
+            sync_type=SyncType.CONNECTOR_DELETION,
+        )
         return None
 
     # set a basic fence to start
@@ -126,6 +136,13 @@ def try_generate_document_cc_pair_cleanup_tasks(
         submitted=datetime.now(timezone.utc),
     )
 
+    # create before setting fence to avoid race condition where the monitoring
+    # task updates the sync record before it is created
+    insert_sync_record(
+        db_session=db_session,
+        entity_id=cc_pair_id,
+        sync_type=SyncType.CONNECTOR_DELETION,
+    )
     redis_connector.delete.set_fence(fence_payload)
 
     try:
diff --git a/backend/onyx/background/celery/tasks/monitoring/tasks.py b/backend/onyx/background/celery/tasks/monitoring/tasks.py
new file mode 100644
index 00000000000..a78939ef036
--- /dev/null
+++ b/backend/onyx/background/celery/tasks/monitoring/tasks.py
@@ -0,0 +1,427 @@
+import json
+from collections.abc import Callable
+from datetime import timedelta
+from typing import Any
+
+from celery import shared_task
+from celery import Task
+from pydantic import BaseModel
+from redis import Redis
+from sqlalchemy import select
+from sqlalchemy.orm import Session
+
+from onyx.background.celery.apps.app_base import task_logger
+from onyx.background.celery.tasks.vespa.tasks import celery_get_queue_length
+from onyx.configs.app_configs import JOB_TIMEOUT
+from onyx.configs.constants import OnyxCeleryQueues
+from onyx.configs.constants import OnyxCeleryTask
+from onyx.db.engine import get_db_current_time
+from onyx.db.engine import get_session_with_tenant
+from onyx.db.enums import IndexingStatus
+from onyx.db.enums import SyncType
+from onyx.db.models import ConnectorCredentialPair
+from onyx.db.models import DocumentSet
+from onyx.db.models import IndexAttempt
+from onyx.db.models import SyncRecord
+from onyx.db.models import UserGroup
+from onyx.redis.redis_pool import get_redis_client
+from onyx.utils.telemetry import optional_telemetry
+from onyx.utils.telemetry import RecordType
+
+
+_CONNECTOR_INDEX_ATTEMPT_START_LATENCY_KEY_FMT = (
+    "monitoring_connector_index_attempt_start_latency:{cc_pair_id}:{index_attempt_id}"
+)
+
+_CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT = (
+    "monitoring_connector_index_attempt_run_success:{cc_pair_id}:{index_attempt_id}"
+)
+
+
+def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None:
+    """Mark a metric as having been emitted by setting a Redis key with expiration"""
+    redis_std.set(key, "1", ex=24 * 60 * 60)  # Expire after 1 day
+
+
+def _has_metric_been_emitted(redis_std: Redis, key: str) -> bool:
+    """Check if a metric has been emitted by checking for existence of Redis key"""
+    return bool(redis_std.exists(key))
+
+
+class Metric(BaseModel):  # one named measurement that can be logged and emitted
+    key: str | None  # only set if we need to remember that this metric was already emitted
+    name: str  # sent as "metric_name" by emit() and logged as "metric" by log()
+    value: Any  # emit() only accepts int, float, str, or bool values
+    tags: dict[str, str]  # passed through unchanged by both log() and emit()
+
+    def log(self) -> None:
+        """Log the metric name, value, and tags as a JSON string at INFO level"""
+        data = {
+            "metric": self.name,
+            "value": self.value,
+            "tags": self.tags,
+        }
+        task_logger.info(json.dumps(data))
+
+    def emit(self) -> None:
+        # Typed copies of value. bool is an int subclass, so it also sets float/int.
+        float_value = (
+            float(self.value) if isinstance(self.value, (int, float)) else None
+        )
+        int_value = int(self.value) if isinstance(self.value, int) else None
+        string_value = str(self.value) if isinstance(self.value, str) else None
+        bool_value = bool(self.value) if isinstance(self.value, bool) else None
+
+        if (
+            float_value is None
+            and int_value is None
+            and string_value is None
+            and bool_value is None
+        ):
+            task_logger.error(
+                f"Invalid metric value type: {type(self.value)} "
+                f"({self.value}) for metric {self.name}."
+            )
+            return  # unsupported value type: nothing is sent
+
+        # don't send None values over the wire
+        data = {
+            k: v
+            for k, v in {
+                "metric_name": self.name,
+                "float_value": float_value,
+                "int_value": int_value,
+                "string_value": string_value,
+                "bool_value": bool_value,
+                "tags": self.tags,
+            }.items()
+            if v is not None
+        }
+        optional_telemetry(
+            record_type=RecordType.METRIC,
+            data=data,
+        )
+
+
+def _collect_queue_metrics(redis_celery: Redis) -> list[Metric]:
+    """Build one queue-length metric for each monitored Celery queue"""
+    # metric name -> Celery queue whose current length is reported under it
+    queue_mappings = {
+        "celery_queue_length": "celery",
+        "indexing_queue_length": "indexing",
+        "sync_queue_length": "sync",
+        "deletion_queue_length": "deletion",
+        "pruning_queue_length": "pruning",
+        "permissions_sync_queue_length": OnyxCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC,
+        "external_group_sync_queue_length": OnyxCeleryQueues.CONNECTOR_EXTERNAL_GROUP_SYNC,
+        "permissions_upsert_queue_length": OnyxCeleryQueues.DOC_PERMISSIONS_UPSERT,
+    }
+
+    # These metrics carry no dedup key, so they are produced on every run.
+    # The "queue" tag holds the metric name (the mapping key), not the queue name.
+    return [
+        Metric(
+            key=None,
+            name=metric_name,
+            value=celery_get_queue_length(queue_name, redis_celery),
+            tags={"queue": metric_name},
+        )
+        for metric_name, queue_name in queue_mappings.items()
+    ]
+
+
+def _build_connector_start_latency_metric(
+    cc_pair: ConnectorCredentialPair,
+    recent_attempt: IndexAttempt,
+    second_most_recent_attempt: IndexAttempt | None,
+    redis_std: Redis,
+) -> Metric | None:
+    if not recent_attempt.time_started:
+        return None
+
+    # check if we already emitted a metric for this index attempt
+    metric_key = _CONNECTOR_INDEX_ATTEMPT_START_LATENCY_KEY_FMT.format(
+        cc_pair_id=cc_pair.id,
+        index_attempt_id=recent_attempt.id,
+    )
+    if _has_metric_been_emitted(redis_std, metric_key):
+        task_logger.info(
+            f"Skipping metric for connector {cc_pair.connector.id} "
+            f"index attempt {recent_attempt.id} because it has already been "
+            "emitted"
+        )
+        return None
+
+    # Connector start latency
+    # first run case - we should start as soon as it's created
+    if not second_most_recent_attempt:
+        desired_start_time = cc_pair.connector.time_created
+    else:
+        if not cc_pair.connector.refresh_freq:
+            task_logger.error(
+                "Found non-initial index attempt for connector "
+                "without refresh_freq. This should never happen."
+            )
+            return None
+
+        desired_start_time = second_most_recent_attempt.time_updated + timedelta(
+            seconds=cc_pair.connector.refresh_freq
+        )
+
+    start_latency = (recent_attempt.time_started - desired_start_time).total_seconds()
+
+    return Metric(
+        key=metric_key,
+        name="connector_start_latency",
+        value=start_latency,
+        tags={},
+    )
+
+
+def _build_run_success_metric(
+    cc_pair: ConnectorCredentialPair, recent_attempt: IndexAttempt, redis_std: Redis
+) -> Metric | None:
+    metric_key = _CONNECTOR_INDEX_ATTEMPT_RUN_SUCCESS_KEY_FMT.format(
+        cc_pair_id=cc_pair.id,
+        index_attempt_id=recent_attempt.id,
+    )
+
+    if _has_metric_been_emitted(redis_std, metric_key):
+        task_logger.info(
+            f"Skipping metric for connector {cc_pair.connector.id} "
+            f"index attempt {recent_attempt.id} because it has already been "
+            "emitted"
+        )
+        return None
+
+    if recent_attempt.status in [
+        IndexingStatus.SUCCESS,
+        IndexingStatus.FAILED,
+        IndexingStatus.CANCELED,
+    ]:
+        return Metric(
+            key=metric_key,
+            name="connector_run_succeeded",
+            value=recent_attempt.status == IndexingStatus.SUCCESS,
+            tags={"source": str(cc_pair.connector.source)},
+        )
+
+    return None
+
+
+def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
+    """Collect metrics about connector runs from the past hour.
+
+    Builds start latency and run success metrics for each cc_pair's latest attempt.
+    """
+    # NOTE: use get_db_current_time since the IndexAttempt times are set based on DB time
+    one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
+
+    # Get all connector credential pairs
+    cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()
+
+    metrics = []
+    for cc_pair in cc_pairs:
+        # Get the two latest attempts; the previous one may be older than an hour
+        recent_attempts = (
+            db_session.query(IndexAttempt)
+            .filter(IndexAttempt.connector_credential_pair_id == cc_pair.id)
+            .order_by(IndexAttempt.time_created.desc())
+            .limit(2)
+            .all()
+        )
+        recent_attempt = recent_attempts[0] if recent_attempts else None
+        second_most_recent_attempt = (
+            recent_attempts[1] if len(recent_attempts) > 1 else None
+        )
+
+        # only report on attempts that were created within the last hour
+        if not recent_attempt or recent_attempt.time_created < one_hour_ago:
+            continue
+
+        # Connector start latency
+        start_latency_metric = _build_connector_start_latency_metric(
+            cc_pair, recent_attempt, second_most_recent_attempt, redis_std
+        )
+        if start_latency_metric:
+            metrics.append(start_latency_metric)
+
+        # Connector run success/failure
+        run_success_metric = _build_run_success_metric(
+            cc_pair, recent_attempt, redis_std
+        )
+        if run_success_metric:
+            metrics.append(run_success_metric)
+
+    return metrics
+
+
+def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]:
+    """Collect sync speed and sync start latency metrics for recent sync records"""
+    # NOTE: use get_db_current_time since the SyncRecord times are set based on DB time
+    one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
+
+    # Get all sync records from the last hour
+    recent_sync_records = db_session.scalars(
+        select(SyncRecord)
+        .where(SyncRecord.sync_start_time >= one_hour_ago)
+        .order_by(SyncRecord.sync_start_time.desc())
+    ).all()
+
+    metrics = []
+    for sync_record in recent_sync_records:
+        # Skip if no end time (sync still in progress)
+        if not sync_record.sync_end_time:
+            continue
+
+        # Check if we already emitted a metric for this sync record
+        metric_key = (
+            f"sync_speed:{sync_record.sync_type}:"
+            f"{sync_record.entity_id}:{sync_record.id}"
+        )
+        if _has_metric_been_emitted(redis_std, metric_key):
+            task_logger.debug(
+                f"Skipping metric for sync record {sync_record.id} "
+                "because it has already been emitted"
+            )
+            continue
+
+        # Calculate sync duration in minutes
+        sync_duration_mins = (
+            sync_record.sync_end_time - sync_record.sync_start_time
+        ).total_seconds() / 60.0
+
+        # Calculate sync speed (docs/min) - avoid division by zero
+        sync_speed = (
+            sync_record.num_docs_synced / sync_duration_mins
+            if sync_duration_mins > 0
+            else None
+        )
+
+        if sync_speed is None:
+            task_logger.error(
+                "Something went wrong with sync speed calculation. "
+                f"Sync record: {sync_record.id}"
+            )
+            continue
+
+        metrics.append(
+            Metric(
+                key=metric_key,
+                name="sync_speed_docs_per_min",
+                value=sync_speed,
+                tags={
+                    "sync_type": str(sync_record.sync_type),
+                    "status": str(sync_record.sync_status),
+                },
+            )
+        )
+
+        # Add sync start latency metric
+        start_latency_key = (
+            f"sync_start_latency:{sync_record.sync_type}"
+            f":{sync_record.entity_id}:{sync_record.id}"
+        )
+        if _has_metric_been_emitted(redis_std, start_latency_key):
+            task_logger.debug(
+                f"Skipping start latency metric for sync record {sync_record.id} "
+                "because it has already been emitted"
+            )
+            continue
+
+        # Get the entity's last update time based on sync type
+        entity: DocumentSet | UserGroup | None = None
+        if sync_record.sync_type == SyncType.DOCUMENT_SET:
+            entity = db_session.scalar(
+                select(DocumentSet).where(DocumentSet.id == sync_record.entity_id)
+            )
+        elif sync_record.sync_type == SyncType.USER_GROUP:
+            entity = db_session.scalar(
+                select(UserGroup).where(UserGroup.id == sync_record.entity_id)
+            )
+        else:
+            # Skip other sync types
+            task_logger.debug(
+                f"Skipping sync record {sync_record.id} "
+                f"with type {sync_record.sync_type} "
+                f"and id {sync_record.entity_id} "
+                "because it is not a document set or user group"
+            )
+            continue
+
+        if entity is None:
+            task_logger.error(
+                f"Could not find entity for sync record {sync_record.id} "
+                f"with type {sync_record.sync_type} and id {sync_record.entity_id}"
+            )
+            continue
+
+        # Calculate start latency in seconds
+        start_latency = (
+            sync_record.sync_start_time - entity.time_last_modified_by_user
+        ).total_seconds()
+        if start_latency < 0:
+            task_logger.error(
+                f"Start latency is negative for sync record {sync_record.id} "
+                f"with type {sync_record.sync_type} and id {sync_record.entity_id}. "
+                "This is likely because the entity was updated between the time "
+                "the sync finished and this job ran. Skipping."
+            )
+            continue
+
+        metrics.append(
+            Metric(
+                key=start_latency_key,
+                name="sync_start_latency_seconds",
+                value=start_latency,
+                tags={
+                    "sync_type": str(sync_record.sync_type),
+                },
+            )
+        )
+
+    return metrics
+
+
+@shared_task(
+    name=OnyxCeleryTask.MONITOR_BACKGROUND_PROCESSES,
+    soft_time_limit=JOB_TIMEOUT,
+    queue=OnyxCeleryQueues.MONITORING,
+    bind=True,
+)
+def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
+    """Collect and emit metrics about background processes.
+    This task runs periodically to gather metrics about:
+    - Queue lengths for different Celery queues
+    - Connector run metrics (start latency, run success)
+    - Sync speed and sync start latency metrics
+    Keyed metrics are marked in Redis after emission so they are not re-sent.
+    """
+    task_logger.info("Starting background process monitoring")
+
+    try:
+        # Redis client backing the Celery broker (used for queue lengths)
+        redis_celery = self.app.broker_connection().channel().client  # type: ignore
+        redis_std = get_redis_client(tenant_id=tenant_id)
+
+        with get_session_with_tenant(tenant_id) as db_session:
+            # Built inside the session scope so the closures below capture a
+            # db_session that is already bound
+            metric_functions: list[Callable[[], list[Metric]]] = [
+                lambda: _collect_queue_metrics(redis_celery),
+                lambda: _collect_connector_metrics(db_session, redis_std),
+                lambda: _collect_sync_metrics(db_session, redis_std),
+            ]
+            for metric_fn in metric_functions:
+                for metric in metric_fn():
+                    metric.log()
+                    metric.emit()
+                    # keyed metrics are recorded so they are emitted only once
+                    if metric.key:
+                        _mark_metric_as_emitted(redis_std, metric.key)
+
+        task_logger.info("Successfully collected background process metrics")
+
+    except Exception:
+        task_logger.exception("Error collecting background process metrics")
+        raise
diff --git a/backend/onyx/background/celery/tasks/vespa/tasks.py b/backend/onyx/background/celery/tasks/vespa/tasks.py
index 8eabeb7d8a8..dea1981f0fa 100644
--- a/backend/onyx/background/celery/tasks/vespa/tasks.py
+++ b/backend/onyx/background/celery/tasks/vespa/tasks.py
@@ -1,6 +1,7 @@
 import random
 import time
 import traceback
+from collections.abc import Callable
 from datetime import datetime
 from datetime import timezone
 from http import HTTPStatus
@@ -53,10 +54,16 @@
 from onyx.db.document_set import mark_document_set_as_synced
 from onyx.db.engine import get_session_with_tenant
 from onyx.db.enums import IndexingStatus
+from onyx.db.enums import SyncStatus
+from onyx.db.enums import SyncType
 from onyx.db.index_attempt import delete_index_attempts
 from onyx.db.index_attempt import get_index_attempt
 from onyx.db.index_attempt import mark_attempt_failed
 from onyx.db.models import DocumentSet
+from onyx.db.models import UserGroup
+from onyx.db.sync_record import cleanup_sync_records
+from onyx.db.sync_record import insert_sync_record
+from onyx.db.sync_record import update_sync_record_status
 from onyx.document_index.document_index_utils import get_both_index_names
 from onyx.document_index.factory import get_default_document_index
 from onyx.document_index.interfaces import VespaDocumentFields
@@ -283,6 +290,13 @@ def try_generate_document_set_sync_tasks(
         return None
 
     if document_set.is_up_to_date:
+        # there should be no in-progress sync records if this is up to date
+        # clean it up just in case things got into a bad state
+        cleanup_sync_records(
+            db_session=db_session,
+            entity_id=document_set_id,
+            sync_type=SyncType.DOCUMENT_SET,
+        )
         return None
 
     # add tasks to celery and build up the task set to monitor in redis
@@ -311,6 +325,13 @@ def try_generate_document_set_sync_tasks(
         f"document_set={document_set.id} tasks_generated={tasks_generated}"
     )
 
+    # create before setting fence to avoid race condition where the monitoring
+    # task updates the sync record before it is created
+    insert_sync_record(
+        db_session=db_session,
+        entity_id=document_set_id,
+        sync_type=SyncType.DOCUMENT_SET,
+    )
     # set this only after all tasks have been added
     rds.set_fence(tasks_generated)
     return tasks_generated
@@ -332,8 +353,9 @@ def try_generate_user_group_sync_tasks(
         return None
 
     # race condition with the monitor/cleanup function if we use a cached result!
-    fetch_user_group = fetch_versioned_implementation(
-        "onyx.db.user_group", "fetch_user_group"
+    fetch_user_group = cast(
+        Callable[[Session, int], UserGroup | None],
+        fetch_versioned_implementation("onyx.db.user_group", "fetch_user_group"),
     )
 
     usergroup = fetch_user_group(db_session, usergroup_id)
@@ -341,6 +363,13 @@ def try_generate_user_group_sync_tasks(
         return None
 
     if usergroup.is_up_to_date:
+        # there should be no in-progress sync records if this is up to date
+        # clean it up just in case things got into a bad state
+        cleanup_sync_records(
+            db_session=db_session,
+            entity_id=usergroup_id,
+            sync_type=SyncType.USER_GROUP,
+        )
         return None
 
     # add tasks to celery and build up the task set to monitor in redis
@@ -368,8 +397,16 @@ def try_generate_user_group_sync_tasks(
         f"usergroup={usergroup.id} tasks_generated={tasks_generated}"
     )
 
+    # create before setting fence to avoid race condition where the monitoring
+    # task updates the sync record before it is created
+    insert_sync_record(
+        db_session=db_session,
+        entity_id=usergroup_id,
+        sync_type=SyncType.USER_GROUP,
+    )
     # set this only after all tasks have been added
     rug.set_fence(tasks_generated)
+
     return tasks_generated
 
 
@@ -419,6 +456,13 @@ def monitor_document_set_taskset(
         f"remaining={count} initial={initial_count}"
     )
     if count > 0:
+        update_sync_record_status(
+            db_session=db_session,
+            entity_id=document_set_id,
+            sync_type=SyncType.DOCUMENT_SET,
+            sync_status=SyncStatus.IN_PROGRESS,
+            num_docs_synced=count,
+        )
         return
 
     document_set = cast(
@@ -437,6 +481,13 @@ def monitor_document_set_taskset(
             task_logger.info(
                 f"Successfully synced document set: document_set={document_set_id}"
             )
+        update_sync_record_status(
+            db_session=db_session,
+            entity_id=document_set_id,
+            sync_type=SyncType.DOCUMENT_SET,
+            sync_status=SyncStatus.SUCCESS,
+            num_docs_synced=initial_count,
+        )
 
     rds.reset()
 
@@ -470,6 +521,14 @@ def monitor_connector_deletion_taskset(
         f"Connector deletion progress: cc_pair={cc_pair_id} remaining={remaining} initial={fence_data.num_tasks}"
     )
     if remaining > 0:
+        with get_session_with_tenant(tenant_id) as db_session:
+            update_sync_record_status(
+                db_session=db_session,
+                entity_id=cc_pair_id,
+                sync_type=SyncType.CONNECTOR_DELETION,
+                sync_status=SyncStatus.IN_PROGRESS,
+                num_docs_synced=remaining,
+            )
         return
 
     with get_session_with_tenant(tenant_id) as db_session:
@@ -545,11 +604,29 @@ def monitor_connector_deletion_taskset(
                 )
                 db_session.delete(connector)
             db_session.commit()
+
+            update_sync_record_status(
+                db_session=db_session,
+                entity_id=cc_pair_id,
+                sync_type=SyncType.CONNECTOR_DELETION,
+                sync_status=SyncStatus.SUCCESS,
+                num_docs_synced=fence_data.num_tasks,
+            )
+
         except Exception as e:
             db_session.rollback()
             stack_trace = traceback.format_exc()
             error_message = f"Error: {str(e)}\n\nStack Trace:\n{stack_trace}"
             add_deletion_failure_message(db_session, cc_pair_id, error_message)
+
+            update_sync_record_status(
+                db_session=db_session,
+                entity_id=cc_pair_id,
+                sync_type=SyncType.CONNECTOR_DELETION,
+                sync_status=SyncStatus.FAILED,
+                num_docs_synced=fence_data.num_tasks,
+            )
+
             task_logger.exception(
                 f"Connector deletion exceptioned: "
                 f"cc_pair={cc_pair_id} connector={cc_pair.connector_id} credential={cc_pair.credential_id}"
diff --git a/backend/onyx/background/celery/versioned_apps/monitoring.py b/backend/onyx/background/celery/versioned_apps/monitoring.py
new file mode 100644
index 00000000000..29093d1ba3a
--- /dev/null
+++ b/backend/onyx/background/celery/versioned_apps/monitoring.py
@@ -0,0 +1,15 @@
+"""Factory stub for running celery worker / celery beat."""
+from celery import Celery
+
+from onyx.utils.variable_functionality import set_is_ee_based_on_env_variable
+
+set_is_ee_based_on_env_variable()
+
+
+def get_app() -> Celery:
+    from onyx.background.celery.apps.monitoring import celery_app
+
+    return celery_app
+
+
+app = get_app()
diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py
index 5a6ba4c6eda..a3d21bdc724 100644
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -47,6 +47,7 @@
 POSTGRES_CELERY_WORKER_LIGHT_APP_NAME = "celery_worker_light"
 POSTGRES_CELERY_WORKER_HEAVY_APP_NAME = "celery_worker_heavy"
 POSTGRES_CELERY_WORKER_INDEXING_APP_NAME = "celery_worker_indexing"
+POSTGRES_CELERY_WORKER_MONITORING_APP_NAME = "celery_worker_monitoring"
 POSTGRES_CELERY_WORKER_INDEXING_CHILD_APP_NAME = "celery_worker_indexing_child"
 POSTGRES_PERMISSIONS_APP_NAME = "permissions"
 POSTGRES_UNKNOWN_APP_NAME = "unknown"
@@ -260,6 +261,9 @@ class OnyxCeleryQueues:
     # Indexing queue
     CONNECTOR_INDEXING = "connector_indexing"
 
+    # Monitoring queue
+    MONITORING = "monitoring"
+
 
 class OnyxRedisLocks:
     PRIMARY_WORKER = "da_lock:primary_worker"
@@ -308,6 +312,7 @@ class OnyxCeleryTask:
     CHECK_FOR_EXTERNAL_GROUP_SYNC = "check_for_external_group_sync"
     CHECK_FOR_LLM_MODEL_UPDATE = "check_for_llm_model_update"
     MONITOR_VESPA_SYNC = "monitor_vespa_sync"
+    MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes"
     KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
     CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
         "connector_permission_sync_generator_task"
diff --git a/backend/onyx/db/document_set.py b/backend/onyx/db/document_set.py
index dfc4f53a189..54e9c3fc50c 100644
--- a/backend/onyx/db/document_set.py
+++ b/backend/onyx/db/document_set.py
@@ -218,6 +218,7 @@ def insert_document_set(
             description=document_set_creation_request.description,
             user_id=user_id,
             is_public=document_set_creation_request.is_public,
+            time_updated=func.now(),
         )
         db_session.add(new_document_set_row)
         db_session.flush()  # ensure the new document set gets assigned an ID
@@ -293,7 +294,7 @@ def update_document_set(
         document_set_row.description = document_set_update_request.description
         document_set_row.is_up_to_date = False
         document_set_row.is_public = document_set_update_request.is_public
-
+        document_set_row.time_last_modified_by_user = func.now()
         versioned_private_doc_set_fn = fetch_versioned_implementation(
             "onyx.db.document_set", "make_doc_set_private"
         )
diff --git a/backend/onyx/db/enums.py b/backend/onyx/db/enums.py
index 0ccb1470ca7..b32825298e3 100644
--- a/backend/onyx/db/enums.py
+++ b/backend/onyx/db/enums.py
@@ -24,12 +24,27 @@ class IndexingMode(str, PyEnum):
     REINDEX = "reindex"
 
 
-# these may differ in the future, which is why we're okay with this duplication
-class DeletionStatus(str, PyEnum):
-    NOT_STARTED = "not_started"
+class SyncType(str, PyEnum):
+    DOCUMENT_SET = "document_set"
+    USER_GROUP = "user_group"
+    CONNECTOR_DELETION = "connector_deletion"
+
+    def __str__(self) -> str:
+        return self.value
+
+
+class SyncStatus(str, PyEnum):
     IN_PROGRESS = "in_progress"
     SUCCESS = "success"
     FAILED = "failed"
+    CANCELED = "canceled"
+
+    def is_terminal(self) -> bool:
+        terminal_states = {
+            SyncStatus.SUCCESS,
+            SyncStatus.FAILED,
+        }
+        return self in terminal_states
 
 
 # Consistent with Celery task statuses
diff --git a/backend/onyx/db/models.py b/backend/onyx/db/models.py
index ff1c98d13d8..044ea0f42a1 100644
--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -44,7 +44,7 @@
 from onyx.configs.constants import DocumentSource
 from onyx.configs.constants import FileOrigin
 from onyx.configs.constants import MessageType
-from onyx.db.enums import AccessType, IndexingMode
+from onyx.db.enums import AccessType, IndexingMode, SyncType, SyncStatus
 from onyx.configs.constants import NotificationType
 from onyx.configs.constants import SearchFeedbackType
 from onyx.configs.constants import TokenRateLimitScope
@@ -880,6 +880,46 @@ def __repr__(self) -> str:
         )
 
 
+class SyncRecord(Base):
+    """
+    Represents the status of a "sync" operation (e.g. document set, user group, deletion).
+
+    A "sync" operation is an operation which needs to update a set of documents within
+    Vespa, usually to match the state of Postgres.
+    """
+
+    __tablename__ = "sync_record"
+
+    id: Mapped[int] = mapped_column(Integer, primary_key=True)
+    # document set id, user group id, or cc pair id (for connector deletion)
+    entity_id: Mapped[int] = mapped_column(Integer)
+
+    sync_type: Mapped[SyncType] = mapped_column(Enum(SyncType, native_enum=False))
+    sync_status: Mapped[SyncStatus] = mapped_column(Enum(SyncStatus, native_enum=False))
+
+    num_docs_synced: Mapped[int] = mapped_column(Integer, default=0)
+
+    sync_start_time: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
+    sync_end_time: Mapped[datetime.datetime | None] = mapped_column(
+        DateTime(timezone=True), nullable=True
+    )
+
+    __table_args__ = (
+        Index(
+            "ix_sync_record_entity_id_sync_type_sync_start_time",
+            "entity_id",
+            "sync_type",
+            "sync_start_time",
+        ),
+        Index(
+            "ix_sync_record_entity_id_sync_type_sync_status",
+            "entity_id",
+            "sync_type",
+            "sync_status",
+        ),
+    )
+
+
 class DocumentByConnectorCredentialPair(Base):
     """Represents an indexing of a document by a specific connector / credential pair"""
 
@@ -1283,6 +1323,11 @@ class DocumentSet(Base):
     # given access to it either via the `users` or `groups` relationships
     is_public: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
 
+    # Last time a user updated this document set
+    time_last_modified_by_user: Mapped[datetime.datetime] = mapped_column(
+        DateTime(timezone=True), server_default=func.now()
+    )
+
     connector_credential_pairs: Mapped[list[ConnectorCredentialPair]] = relationship(
         "ConnectorCredentialPair",
         secondary=DocumentSet__ConnectorCredentialPair.__table__,
@@ -1762,6 +1807,11 @@ class UserGroup(Base):
         Boolean, nullable=False, default=False
     )
 
+    # Last time a user updated this user group
+    time_last_modified_by_user: Mapped[datetime.datetime] = mapped_column(
+        DateTime(timezone=True), server_default=func.now()
+    )
+
     users: Mapped[list[User]] = relationship(
         "User",
         secondary=User__UserGroup.__table__,
diff --git a/backend/onyx/db/sync_record.py b/backend/onyx/db/sync_record.py
new file mode 100644
index 00000000000..81b613c271c
--- /dev/null
+++ b/backend/onyx/db/sync_record.py
@@ -0,0 +1,110 @@
+from sqlalchemy import and_
+from sqlalchemy import desc
+from sqlalchemy import func
+from sqlalchemy import select
+from sqlalchemy import update
+from sqlalchemy.orm import Session
+
+from onyx.db.enums import SyncStatus
+from onyx.db.enums import SyncType
+from onyx.db.models import SyncRecord
+
+
+def insert_sync_record(
+    db_session: Session,
+    entity_id: int | None,
+    sync_type: SyncType,
+) -> SyncRecord:
+    """Insert a new sync record into the database.
+
+    Args:
+        db_session: The database session to use
+        entity_id: The ID of the entity being synced (document set ID, user group ID, etc.)
+        sync_type: The type of sync operation
+    """
+    sync_record = SyncRecord(
+        entity_id=entity_id,
+        sync_type=sync_type,
+        sync_status=SyncStatus.IN_PROGRESS,
+        num_docs_synced=0,
+        sync_start_time=func.now(),
+    )
+    db_session.add(sync_record)
+    db_session.commit()
+
+    return sync_record
+
+
+def fetch_latest_sync_record(
+    db_session: Session,
+    entity_id: int,
+    sync_type: SyncType,
+) -> SyncRecord | None:
+    """Fetch the most recent sync record for a given entity ID and sync type.
+
+    Args:
+        db_session: The database session to use
+        entity_id: The ID of the entity to fetch sync record for
+        sync_type: The type of sync operation
+    """
+    stmt = (
+        select(SyncRecord)
+        .where(
+            and_(
+                SyncRecord.entity_id == entity_id,
+                SyncRecord.sync_type == sync_type,
+            )
+        )
+        .order_by(desc(SyncRecord.sync_start_time))
+        .limit(1)
+    )
+
+    result = db_session.execute(stmt)
+    return result.scalar_one_or_none()
+
+
+def update_sync_record_status(
+    db_session: Session,
+    entity_id: int,
+    sync_type: SyncType,
+    sync_status: SyncStatus,
+    num_docs_synced: int | None = None,
+) -> None:
+    """Update the status of a sync record.
+
+    Args:
+        db_session: The database session to use
+        entity_id: The ID of the entity being synced
+        sync_type: The type of sync operation
+        sync_status: The new status to set
+        num_docs_synced: Optional number of documents synced to update
+    """
+    sync_record = fetch_latest_sync_record(db_session, entity_id, sync_type)
+    if sync_record is None:
+        raise ValueError(
+            f"No sync record found for entity_id={entity_id} sync_type={sync_type}"
+        )
+
+    sync_record.sync_status = sync_status
+    if num_docs_synced is not None:
+        sync_record.num_docs_synced = num_docs_synced
+
+    if sync_status.is_terminal():
+        sync_record.sync_end_time = func.now()  # type: ignore
+
+    db_session.commit()
+
+
+def cleanup_sync_records(
+    db_session: Session, entity_id: int, sync_type: SyncType
+) -> None:
+    """Mark in-progress sync records for a given entity ID and sync type as canceled."""
+    stmt = (
+        update(SyncRecord)
+        .where(SyncRecord.entity_id == entity_id)
+        .where(SyncRecord.sync_type == sync_type)
+        .where(SyncRecord.sync_status == SyncStatus.IN_PROGRESS)
+        .values(sync_status=SyncStatus.CANCELED, sync_end_time=func.now())
+    )
+    db_session.execute(stmt)
+    db_session.commit()
diff --git a/backend/onyx/utils/telemetry.py b/backend/onyx/utils/telemetry.py
index 23a239a5564..793bab8ce64 100644
--- a/backend/onyx/utils/telemetry.py
+++ b/backend/onyx/utils/telemetry.py
@@ -33,6 +33,7 @@ class RecordType(str, Enum):
     USAGE = "usage"
     LATENCY = "latency"
     FAILURE = "failure"
+    METRIC = "metric"
 
 
 def get_or_generate_uuid() -> str:
diff --git a/backend/supervisord.conf b/backend/supervisord.conf
index 1a17f5d17d0..78d5679bae7 100644
--- a/backend/supervisord.conf
+++ b/backend/supervisord.conf
@@ -65,6 +65,18 @@ autorestart=true
 startsecs=10
 stopasgroup=true
 
+[program:celery_worker_monitoring]
+command=celery -A onyx.background.celery.versioned_apps.monitoring worker
+    --loglevel=INFO
+    --hostname=monitoring@%%n
+    -Q monitoring
+stdout_logfile=/var/log/celery_worker_monitoring.log
+stdout_logfile_maxbytes=16MB
+redirect_stderr=true
+autorestart=true
+startsecs=10
+stopasgroup=true
+
 # Job scheduler for periodic tasks
 [program:celery_beat]
 command=celery -A onyx.background.celery.versioned_apps.beat beat

From a610b6bd8d9cbbfe29f3866389e8f4a60858b022 Mon Sep 17 00:00:00 2001
From: Weves <chrisweaver101@gmail.com>
Date: Mon, 13 Jan 2025 13:11:07 -0800
Subject: [PATCH 27/29] Support new model for image input

---
 web/src/lib/llm/utils.ts | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/web/src/lib/llm/utils.ts b/web/src/lib/llm/utils.ts
index 6fc1fc7a9f2..3eca6caccfb 100644
--- a/web/src/lib/llm/utils.ts
+++ b/web/src/lib/llm/utils.ts
@@ -102,12 +102,13 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
   // meta models
   "llama-3.2-90b-vision-instruct",
   "llama-3.2-11b-vision-instruct",
+  "Llama-3-2-11B-Vision-Instruct-yb",
 ];
 
 export function checkLLMSupportsImageInput(model: string) {
   // Original exact match check
   const exactMatch = MODEL_NAMES_SUPPORTING_IMAGE_INPUT.some(
-    (modelName) => modelName === model
+    (modelName) => modelName.toLowerCase() === model.toLowerCase()
   );
 
   if (exactMatch) {
@@ -116,12 +117,13 @@ export function checkLLMSupportsImageInput(model: string) {
 
   // Additional check for the last part of the model name
   const modelParts = model.split(/[/.]/);
-  const lastPart = modelParts[modelParts.length - 1];
+  const lastPart = modelParts[modelParts.length - 1]?.toLowerCase();
 
   return MODEL_NAMES_SUPPORTING_IMAGE_INPUT.some((modelName) => {
     const modelNameParts = modelName.split(/[/.]/);
     const modelNameLastPart = modelNameParts[modelNameParts.length - 1];
-    return modelNameLastPart === lastPart;
+    // lastPart is already lowercased above for tiny performance gain
+    return modelNameLastPart?.toLowerCase() === lastPart;
   });
 }
 

From c4323573d241deb20c3d838a51dedb155a2ebea7 Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 13:23:40 -0800
Subject: [PATCH 28/29] fix alembic

---
 .../0f7ff6d75b57_add_index_to_index_attempt_time_created.py     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py b/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
index 23db56bd61e..e23a6186e27 100644
--- a/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
+++ b/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
@@ -9,7 +9,7 @@
 
 # revision identifiers, used by Alembic.
 revision = "0f7ff6d75b57"
-down_revision = "369644546676"
+down_revision = "fec3db967bf7"
 branch_labels: None = None
 depends_on: None = None
 

From d8aa21ca3a8f37addc9819f6e7589ca9dbd843ce Mon Sep 17 00:00:00 2001
From: "Richard Kuo (Danswer)" <rkuo@onyx.app>
Date: Mon, 13 Jan 2025 14:32:26 -0800
Subject: [PATCH 29/29] fix missed var names

---
 backend/ee/onyx/db/user_group.py | 4 +++-
 backend/onyx/db/document_set.py  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/backend/ee/onyx/db/user_group.py b/backend/ee/onyx/db/user_group.py
index 0adff8097a9..791a6cebce5 100644
--- a/backend/ee/onyx/db/user_group.py
+++ b/backend/ee/onyx/db/user_group.py
@@ -374,7 +374,9 @@ def _add_user_group__cc_pair_relationships__no_commit(
 
 
 def insert_user_group(db_session: Session, user_group: UserGroupCreate) -> UserGroup:
-    db_user_group = UserGroup(name=user_group.name, time_updated=func.now())
+    db_user_group = UserGroup(
+        name=user_group.name, time_last_modified_by_user=func.now()
+    )
     db_session.add(db_user_group)
     db_session.flush()  # give the group an ID
 
diff --git a/backend/onyx/db/document_set.py b/backend/onyx/db/document_set.py
index 54e9c3fc50c..7df2ca0ac12 100644
--- a/backend/onyx/db/document_set.py
+++ b/backend/onyx/db/document_set.py
@@ -218,7 +218,7 @@ def insert_document_set(
             description=document_set_creation_request.description,
             user_id=user_id,
             is_public=document_set_creation_request.is_public,
-            time_updated=func.now(),
+            time_last_modified_by_user=func.now(),
         )
         db_session.add(new_document_set_row)
         db_session.flush()  # ensure the new document set gets assigned an ID