Fix: Chroma Upsert instead of Add (#3086)

potter-potter · web-flow · commit 31a53c8a2858 · 2024-05-23T19:56:19.000Z
Thanks to @0xjgv, the Chroma connector now upserts instead of adding, which prevents duplicate embeddings. This commit also includes a Hugging Face example; we already had examples for all the other embedders.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,8 @@
   to avoid text being dynamically injected into the XML document.
 * Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call.
 
+* **ChromaDB: change from `add` to `upsert`, using `element_id` as the record ID, to make writes idempotent**
+
 ## 0.14.2
 
 ### Enhancements
diff --git a/unstructured/ingest/connector/chroma.py b/unstructured/ingest/connector/chroma.py
@@ -111,7 +111,8 @@ def upsert_batch(self, batch):
 
         try:
             # Chroma wants lists even if there is only one element
-            collection.add(
+            # Upserting to prevent duplicates
+            collection.upsert(
                 ids=batch["ids"],
                 documents=batch["documents"],
                 embeddings=batch["embeddings"],
@@ -147,8 +148,9 @@ def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs)
             self.upsert_batch(self.prepare_chroma_list(chunk))
 
     def normalize_dict(self, element_dict: dict) -> dict:
+        element_id = element_dict.get("element_id", str(uuid.uuid4()))
         return {
-            "id": str(uuid.uuid4()),
+            "id": element_id,
             "embedding": element_dict.pop("embeddings", None),
             "document": element_dict.pop("text", None),
             "metadata": flatten_dict(