ref: Rename EdgeExtractor to LinkExtractor #516

Merged 9 commits on Jun 26, 2024
Changes from 1 commit
6 changes: 3 additions & 3 deletions libs/knowledge-store/README.md
@@ -19,12 +19,12 @@ The Graph Store makes use of the following metadata fields on each `Document`:

#### Hyperlinks

To connect nodes based on hyperlinks, you can use the `HtmlLinkEdgeExtractor` as shown below:
To connect nodes based on hyperlinks, you can use the `HtmlLinkExtractor` as shown below:

```python
from ragstack_knowledge_store.langchain.extractors import HtmlLinkEdgeExtractor
from ragstack_knowledge_store.langchain.extractors import HtmlLinkExtractor

html_link_extractor = HtmlLinkEdgeExtractor()
html_link_extractor = HtmlLinkExtractor()

for doc in documents:
doc.metadata["content_id"] = doc.metadata["source"]
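For context, a minimal sketch of how the README loop might look once the renamed extractor is combined with the return-a-set-of-links API introduced later in this PR. The `ragstack_langchain.graph_store` import paths and the `add_links` helper are taken from the other files in this diff, and `documents` is the same collection the README snippet iterates over; treat the exact wiring as an assumption rather than the documented API.

```python
# Sketch only: import paths and helpers assumed from elsewhere in this PR.
from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor
from ragstack_langchain.graph_store.links import add_links

html_link_extractor = HtmlLinkExtractor()

for doc in documents:  # `documents`: Document objects whose page_content is HTML
    doc.metadata["content_id"] = doc.metadata["source"]
    # The renamed extractor now returns a set of Link objects instead of
    # mutating the document, so the links are attached explicitly.
    links = html_link_extractor.extract_one(
        doc.page_content, base_url=doc.metadata["source"]
    )
    add_links(doc, *links)
```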
88 changes: 51 additions & 37 deletions libs/knowledge-store/notebooks/astra_support.ipynb
@@ -9,18 +9,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zsh:1: no matches found: ragstack-ai-langchain[knowledge-store]\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"%pip install -q ragstack-ai-langchain[knowledge-store] beautifulsoup4 markdownify python-dotenv"
]
@@ -45,20 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1374"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Use sitemaps to crawl the content\n",
"SITEMAPS = [\n",
@@ -115,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -125,10 +105,10 @@
"from typing import AsyncIterator, Iterable\n",
"from ragstack_knowledge_store.graph_store import CONTENT_ID\n",
"from markdownify import MarkdownConverter\n",
"from ragstack_langchain.graph_store.extractors import HtmlLinkEdgeExtractor\n",
"from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor\n",
"\n",
"markdown_converter = MarkdownConverter(heading_style=\"ATX\")\n",
"html_link_extractor = HtmlLinkEdgeExtractor()\n",
"html_link_extractor = HtmlLinkExtractor()\n",
"\n",
"\n",
"def select_content(soup: BeautifulSoup, url: str) -> BeautifulSoup:\n",
@@ -180,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -229,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -238,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -260,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -319,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -354,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -363,7 +343,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -451,7 +431,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -475,9 +455,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/markdown": [
"Astra DB Serverless uses the JVector vector search engine to construct a graph-based index. JVector is part of the DiskANN family and is designed to facilitate approximate nearest neighbor (ANN) search, which is crucial for handling high-dimensional vector spaces efficiently.\n",
"\n",
"Here are the key aspects of JVector and its indexing algorithms:\n",
"\n",
"1. **Graph-Based Index**: JVector constructs a single-layer graph with nonblocking concurrency control. This allows for scalable and efficient search operations.\n",
"\n",
"2. **Incremental Updates**: JVector supports incremental construction and updates to the index, making it suitable for dynamic datasets.\n",
"\n",
"3. **Two-Pass Search**: JVector employs a two-pass search strategy:\n",
" - **First Pass**: Uses lossily compressed representations of vectors stored in memory to quickly narrow down candidates.\n",
" - **Second Pass**: Uses more accurate representations read from disk to refine the search results.\n",
"\n",
"4. **Compression Techniques**: JVector supports various vector compression techniques to optimize memory usage and performance:\n",
" - **Product Quantization (PQ)**: A method that compresses vectors by splitting them into subspaces and quantizing each subspace separately.\n",
" - **Binary Quantization (BQ)**: Another compression method, although it is generally less effective than PQ for most embedding models.\n",
" - **Fused ADC (Asymmetric Distance Computation)**: Combines PQ with efficient distance computation methods to enhance search speed.\n",
"\n",
"5. **DiskANN Architecture**: JVector builds on the DiskANN design, allowing it to handle larger-than-memory indexes by storing additional data on disk.\n",
"\n",
"6. **High-Dimensional Optimization**: JVector uses the Panama Vector API (SIMD) to optimize ANN indexing and search operations, ensuring high performance even with large datasets.\n",
"\n",
"In summary, Astra DB Serverless leverages the JVector engine, which employs a graph-based index with advanced compression and search optimization techniques to provide efficient vector search capabilities."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"run_and_render(mmr_graph_rag_chain, QUESTION)"
]
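Most of the notebook churn above is refreshed execution counts; the substantive change is the switch to `HtmlLinkExtractor`. A rough, hypothetical sketch of how the collapsed conversion cell could use the renamed extractor under the new API follows. The function name, the metadata keys, and the overall shape are assumptions reconstructed from the visible imports.

```python
# Hypothetical sketch of the notebook's HTML-to-Document conversion step.
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from markdownify import MarkdownConverter
from ragstack_knowledge_store.graph_store import CONTENT_ID
from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor
from ragstack_langchain.graph_store.links import add_links

markdown_converter = MarkdownConverter(heading_style="ATX")
html_link_extractor = HtmlLinkExtractor()


def convert_html(html: str, url: str) -> Document:
    soup = BeautifulSoup(html, "html.parser")
    # Under the new API the extractor returns a set of links instead of
    # writing them onto the document itself.
    links = html_link_extractor.extract_one(soup, base_url=url)
    doc = Document(
        page_content=markdown_converter.convert_soup(soup),
        metadata={CONTENT_ID: url, "source": url},
    )
    add_links(doc, *links)
    return doc
```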
@@ -1,7 +1,7 @@
from .edge_extractor import EdgeExtractor
from .html_link_edge_extractor import HtmlLinkEdgeExtractor
from .link_extractor import LinkExtractor
from .html_link_extractor import HtmlLinkExtractor

__all__ = [
"EdgeExtractor",
"HtmlLinkEdgeExtractor",
"LinkExtractor",
"HtmlLinkExtractor",
]

This file was deleted.

@@ -1,12 +1,10 @@
from langchain_core.documents import Document
from typing import TYPE_CHECKING, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from ragstack_langchain.graph_store.links import (
add_links,
Link
)
from .edge_extractor import EdgeExtractor
from .link_extractor import LinkExtractor

if TYPE_CHECKING:
from bs4 import BeautifulSoup
@@ -47,17 +45,15 @@ def _parse_hrefs(soup: "BeautifulSoup", url: str, drop_fragments: bool = True) -
return links


class HtmlLinkEdgeExtractor(EdgeExtractor[Union[str, "BeautifulSoup"]]):
class HtmlLinkExtractor(LinkExtractor[Union[str, "BeautifulSoup"]]):
def __init__(
self, url_field: str = "source", *, kind: str = "hyperlink", drop_fragments: bool = True
self, *, kind: str = "hyperlink", drop_fragments: bool = True
):
"""Extract hyperlinks from HTML content.

Expects the `page_content` to be HTML.
Expects the input to be an HTML string or a `BeautifulSoup` object.

Args:
url_field: Name of the metadata field containing the URL
of the content. Defaults to "source".
kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
dropped. Defaults to `True`.
@@ -66,30 +62,29 @@ def __init__(
import bs4 # noqa:F401
except ImportError:
raise ImportError(
"BeautifulSoup4 is required for HtmlLinkEdgeExtractor. "
"BeautifulSoup4 is required for HtmlLinkExtractor. "
"Please install it with `pip install beautifulsoup4`."
)

self.url_field = url_field
self._kind = kind
self.drop_fragments = drop_fragments

def extract_one(
self,
document: Document,
input: Union[str, "BeautifulSoup"],
):
*,
base_url: str,
) -> Set[Link]:
if isinstance(input, str):
from bs4 import BeautifulSoup

input = BeautifulSoup(input, "html.parser")

url = document.metadata[self.url_field]
if self.drop_fragments:
url = urldefrag(url).url
base_url = urldefrag(base_url).url

hrefs = _parse_hrefs(input, url, self.drop_fragments)
hrefs = _parse_hrefs(input, base_url, self.drop_fragments)

add_links(document,
Link.incoming(kind=self._kind, tag=url),
*[Link.outgoing(kind=self._kind, tag=url) for url in hrefs])
links = { Link.outgoing(kind=self._kind, tag=url) for url in hrefs }
links.add(Link.incoming(kind=self._kind, tag=base_url))
return links
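Because the interleaved old and new lines make the final signature hard to read, here is a small illustrative sketch of the call shape after this change. The HTML snippet and URL are invented, and the import path follows the notebook.

```python
# Illustrative call of the new signature: the content is passed positionally,
# the page URL is a keyword-only argument, and the result is a set of Links.
from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor

extractor = HtmlLinkExtractor()
html = '<html><body><a href="/docs/intro.html">Intro</a></body></html>'

links = extractor.extract_one(html, base_url="https://example.com/docs/")
for link in links:
    print(link)
```

The returned set contains one incoming link tagged with the page's own URL plus one outgoing link per extracted href, which callers can then attach to a `Document` with `add_links`.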
@@ -0,0 +1,48 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from itertools import repeat
from typing import Any, Dict, Generic, Iterable, Optional, Set, TypeVar

from ragstack_knowledge_store._utils import strict_zip

from ragstack_langchain.graph_store.links import Link

InputT = TypeVar("InputT")

METADATA_LINKS_KEY = "links"



class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""

@abstractmethod
    def extract_one(self, input: InputT, **kwargs: Any) -> Set[Link]:
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.
**kwargs: Additional keyword arguments for the extractor.

Returns:
Set of links extracted from the input.
"""

def extract_batch(self,
inputs: Iterable[InputT],
batch_kwargs: Optional[Iterable[Dict[str, Any]]] = None,
                      **kwargs: Any) -> Iterable[Set[Link]]:
        """Extract links from each input in the batch.

        Args:
            inputs: The input contents to extract links from.
batch_kwargs: Iterable of keyword arguments for each input.
Defaults to empty dictionaries.
**kwargs: Additional arguments to the extractor.

Returns:
Iterable over the set of links extracted from the input.
"""
for (input, kwargs) in zip(inputs, batch_kwargs or repeat({})):
yield self.extract_one(input, **kwargs)
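For illustration only, a minimal hypothetical subclass (not part of this PR) showing how the new interface composes with `extract_batch`. The `KeywordLinkExtractor` name and its keyword-matching logic are invented, and the import paths are assumed from the notebook and the other files in this diff.

```python
# Hypothetical example subclass of LinkExtractor; not part of this PR.
from typing import Any, Set

from ragstack_langchain.graph_store.extractors import LinkExtractor
from ragstack_langchain.graph_store.links import Link


class KeywordLinkExtractor(LinkExtractor[str]):
    """Connect documents that mention the same keyword."""

    def __init__(self, *, kind: str = "keyword"):
        self._kind = kind

    def extract_one(self, input: str, **kwargs: Any) -> Set[Link]:
        keywords: Set[str] = kwargs["keywords"]
        # Emit an incoming and an outgoing link per matched keyword so that
        # documents mentioning the same keyword become mutually reachable.
        links: Set[Link] = set()
        for kw in keywords:
            if kw.lower() in input.lower():
                links.add(Link.incoming(kind=self._kind, tag=kw))
                links.add(Link.outgoing(kind=self._kind, tag=kw))
        return links


extractor = KeywordLinkExtractor()
texts = ["Astra DB uses JVector.", "JVector builds a graph index."]
# extract_batch pairs each input with its own kwargs and yields one link set
# per input, using the default implementation inherited from LinkExtractor.
for links in extractor.extract_batch(
    texts, batch_kwargs=[{"keywords": {"JVector"}}] * len(texts)
):
    print(links)
```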