ref: Rename EdgeExtractor to LinkExtractor #516

Merged 9 commits on Jun 26, 2024
Changes from 1 commit
6 changes: 3 additions & 3 deletions libs/knowledge-store/README.md
@@ -19,12 +19,12 @@ The Graph Store makes use of the following metadata fields on each `Document`:

#### Hyperlinks

To connect nodes based on hyperlinks, you can use the `HtmlLinkEdgeExtractor` as shown below:
To connect nodes based on hyperlinks, you can use the `HtmlLinkExtractor` as shown below:

```python
from ragstack_knowledge_store.langchain.extractors import HtmlLinkEdgeExtractor
from ragstack_knowledge_store.langchain.extractors import HtmlLinkExtractor

html_link_extractor = HtmlLinkEdgeExtractor()
html_link_extractor = HtmlLinkExtractor()

for doc in documents:
doc.metadata["content_id"] = doc.metadata["source"]
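For context, a minimal sketch of how the README loop might look once the renamed extractor is combined with the return-a-set-of-links API introduced later in this PR. The `ragstack_langchain.graph_store` import paths and the `add_links` helper are taken from the other files in this diff, and `documents` is the same collection the README snippet iterates over; treat the exact wiring as an assumption rather than the documented API.

```python
# Sketch only: import paths and helpers assumed from elsewhere in this PR.
from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor
from ragstack_langchain.graph_store.links import add_links

html_link_extractor = HtmlLinkExtractor()

for doc in documents:  # `documents`: Document objects whose page_content is HTML
    doc.metadata["content_id"] = doc.metadata["source"]
    # The renamed extractor now returns a set of Link objects instead of
    # mutating the document, so the links are attached explicitly.
    links = html_link_extractor.extract_one(
        doc.page_content, base_url=doc.metadata["source"]
    )
    add_links(doc, *links)
```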
88 changes: 51 additions & 37 deletions libs/knowledge-store/notebooks/astra_support.ipynb
@@ -9,18 +9,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"zsh:1: no matches found: ragstack-ai-langchain[knowledge-store]\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"outputs": [],
"source": [
"%pip install -q ragstack-ai-langchain[knowledge-store] beautifulsoup4 markdownify python-dotenv"
]
@@ -45,20 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1374"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Use sitemaps to crawl the content\n",
"SITEMAPS = [\n",
@@ -115,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -125,10 +105,10 @@
"from typing import AsyncIterator, Iterable\n",
"from ragstack_knowledge_store.graph_store import CONTENT_ID\n",
"from markdownify import MarkdownConverter\n",
"from ragstack_langchain.graph_store.extractors import HtmlLinkEdgeExtractor\n",
"from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor\n",
"\n",
"markdown_converter = MarkdownConverter(heading_style=\"ATX\")\n",
"html_link_extractor = HtmlLinkEdgeExtractor()\n",
"html_link_extractor = HtmlLinkExtractor()\n",
"\n",
"\n",
"def select_content(soup: BeautifulSoup, url: str) -> BeautifulSoup:\n",
@@ -180,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -229,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -238,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -260,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -319,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -354,7 +334,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -363,7 +343,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -451,7 +431,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -475,9 +455,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/markdown": [
"Astra DB Serverless uses the JVector vector search engine to construct a graph-based index. JVector is part of the DiskANN family and is designed to facilitate approximate nearest neighbor (ANN) search, which is crucial for handling high-dimensional vector spaces efficiently.\n",
"\n",
"Here are the key aspects of JVector and its indexing algorithms:\n",
"\n",
"1. **Graph-Based Index**: JVector constructs a single-layer graph with nonblocking concurrency control. This allows for scalable and efficient search operations.\n",
"\n",
"2. **Incremental Updates**: JVector supports incremental construction and updates to the index, making it suitable for dynamic datasets.\n",
"\n",
"3. **Two-Pass Search**: JVector employs a two-pass search strategy:\n",
" - **First Pass**: Uses lossily compressed representations of vectors stored in memory to quickly narrow down candidates.\n",
" - **Second Pass**: Uses more accurate representations read from disk to refine the search results.\n",
"\n",
"4. **Compression Techniques**: JVector supports various vector compression techniques to optimize memory usage and performance:\n",
" - **Product Quantization (PQ)**: A method that compresses vectors by splitting them into subspaces and quantizing each subspace separately.\n",
" - **Binary Quantization (BQ)**: Another compression method, although it is generally less effective than PQ for most embedding models.\n",
" - **Fused ADC (Asymmetric Distance Computation)**: Combines PQ with efficient distance computation methods to enhance search speed.\n",
"\n",
"5. **DiskANN Architecture**: JVector builds on the DiskANN design, allowing it to handle larger-than-memory indexes by storing additional data on disk.\n",
"\n",
"6. **High-Dimensional Optimization**: JVector uses the Panama Vector API (SIMD) to optimize ANN indexing and search operations, ensuring high performance even with large datasets.\n",
"\n",
"In summary, Astra DB Serverless leverages the JVector engine, which employs a graph-based index with advanced compression and search optimization techniques to provide efficient vector search capabilities."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"run_and_render(mmr_graph_rag_chain, QUESTION)"
]
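Most of the notebook churn above is refreshed execution counts; the substantive change is the switch to `HtmlLinkExtractor`. A rough, hypothetical sketch of how the collapsed conversion cell could use the renamed extractor under the new API follows. The function name, the metadata keys, and the overall shape are assumptions reconstructed from the visible imports.

```python
# Hypothetical sketch of the notebook's HTML-to-Document conversion step.
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from markdownify import MarkdownConverter
from ragstack_knowledge_store.graph_store import CONTENT_ID
from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor
from ragstack_langchain.graph_store.links import add_links

markdown_converter = MarkdownConverter(heading_style="ATX")
html_link_extractor = HtmlLinkExtractor()


def convert_html(html: str, url: str) -> Document:
    soup = BeautifulSoup(html, "html.parser")
    # Under the new API the extractor returns a set of links instead of
    # writing them onto the document itself.
    links = html_link_extractor.extract_one(soup, base_url=url)
    doc = Document(
        page_content=markdown_converter.convert_soup(soup),
        metadata={CONTENT_ID: url, "source": url},
    )
    add_links(doc, *links)
    return doc
```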
@@ -1,7 +1,7 @@
from .edge_extractor import EdgeExtractor
from .html_link_edge_extractor import HtmlLinkEdgeExtractor
from .link_extractor import LinkExtractor
from .html_link_extractor import HtmlLinkExtractor

__all__ = [
"EdgeExtractor",
"HtmlLinkEdgeExtractor",
"LinkExtractor",
"HtmlLinkExtractor",
]

This file was deleted.

@@ -1,12 +1,10 @@
from langchain_core.documents import Document
from typing import TYPE_CHECKING, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from ragstack_langchain.graph_store.links import (
add_links,
Link
)
from .edge_extractor import EdgeExtractor
from .link_extractor import LinkExtractor

if TYPE_CHECKING:
from bs4 import BeautifulSoup
@@ -47,17 +45,15 @@ def _parse_hrefs(soup: "BeautifulSoup", url: str, drop_fragments: bool = True) -
return links


class HtmlLinkEdgeExtractor(EdgeExtractor[Union[str, "BeautifulSoup"]]):
class HtmlLinkExtractor(LinkExtractor[Union[str, "BeautifulSoup"]]):
def __init__(
self, url_field: str = "source", *, kind: str = "hyperlink", drop_fragments: bool = True
self, *, kind: str = "hyperlink", drop_fragments: bool = True
):
"""Extract hyperlinks from HTML content.

Expects the `page_content` to be HTML.
Expects the input to be an HTML string or a `BeautifulSoup` object.

Args:
url_field: Name of the metadata field containing the URL
of the content. Defaults to "source".
kind: The kind of edge to extract. Defaults to "hyperlink".
            drop_fragments: Whether fragments in URLs and links should be
dropped. Defaults to `True`.
@@ -66,30 +62,29 @@ def __init__(
import bs4 # noqa:F401
except ImportError:
raise ImportError(
"BeautifulSoup4 is required for HtmlLinkEdgeExtractor. "
"BeautifulSoup4 is required for HtmlLinkExtractor. "
"Please install it with `pip install beautifulsoup4`."
)

self.url_field = url_field
self._kind = kind
self.drop_fragments = drop_fragments

def extract_one(
self,
document: Document,
input: Union[str, "BeautifulSoup"],
):
*,
base_url: str,
) -> Set[Link]:
if isinstance(input, str):
from bs4 import BeautifulSoup

input = BeautifulSoup(input, "html.parser")

url = document.metadata[self.url_field]
if self.drop_fragments:
url = urldefrag(url).url
base_url = urldefrag(base_url).url

hrefs = _parse_hrefs(input, url, self.drop_fragments)
hrefs = _parse_hrefs(input, base_url, self.drop_fragments)

add_links(document,
Link.incoming(kind=self._kind, tag=url),
*[Link.outgoing(kind=self._kind, tag=url) for url in hrefs])
links = { Link.outgoing(kind=self._kind, tag=url) for url in hrefs }
links.add(Link.incoming(kind=self._kind, tag=base_url))
return links
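Because the interleaved old and new lines make the final signature hard to read, here is a small illustrative sketch of the call shape after this change. The HTML snippet and URL are invented, and the import path follows the notebook.

```python
# Illustrative call of the new signature: the content is passed positionally,
# the page URL is a keyword-only argument, and the result is a set of Links.
from ragstack_langchain.graph_store.extractors import HtmlLinkExtractor

extractor = HtmlLinkExtractor()
html = '<html><body><a href="/docs/intro.html">Intro</a></body></html>'

links = extractor.extract_one(html, base_url="https://example.com/docs/")
for link in links:
    print(link)
```

The returned set contains one incoming link tagged with the page's own URL plus one outgoing link per extracted href, which callers can then attach to a `Document` with `add_links`.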
@@ -0,0 +1,48 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from itertools import repeat
from typing import Any, Dict, Generic, Iterable, Optional, Set, TypeVar

from ragstack_knowledge_store._utils import strict_zip

from ragstack_langchain.graph_store.links import Link

InputT = TypeVar("InputT")

METADATA_LINKS_KEY = "links"



class LinkExtractor(ABC, Generic[InputT]):
"""Interface for extracting links (incoming, outgoing, bidirectional)."""

@abstractmethod
    def extract_one(self, input: InputT, **kwargs: Any) -> Set[Link]:
        """Extract links from the given input.

        Args:
            input: The input content to extract links from.
**kwargs: Additional keyword arguments for the extractor.

Returns:
Set of links extracted from the input.
"""

def extract_batch(self,
inputs: Iterable[InputT],
batch_kwargs: Optional[Iterable[Dict[str, Any]]] = None,
                      **kwargs: Any) -> Iterable[Set[Link]]:
        """Extract links from each input in the batch.

        Args:
            inputs: The input contents to extract links from.
batch_kwargs: Iterable of keyword arguments for each input.
Defaults to empty dictionaries.
**kwargs: Additional arguments to the extractor.

Returns:
Iterable over the set of links extracted from the input.
"""
for (input, kwargs) in zip(inputs, batch_kwargs or repeat({})):
yield self.extract_one(input, **kwargs)
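For illustration only, a minimal hypothetical subclass (not part of this PR) showing how the new interface composes with `extract_batch`. The `KeywordLinkExtractor` name and its keyword-matching logic are invented, and the import paths are assumed from the notebook and the other files in this diff.

```python
# Hypothetical example subclass of LinkExtractor; not part of this PR.
from typing import Any, Set

from ragstack_langchain.graph_store.extractors import LinkExtractor
from ragstack_langchain.graph_store.links import Link


class KeywordLinkExtractor(LinkExtractor[str]):
    """Connect documents that mention the same keyword."""

    def __init__(self, *, kind: str = "keyword"):
        self._kind = kind

    def extract_one(self, input: str, **kwargs: Any) -> Set[Link]:
        keywords: Set[str] = kwargs["keywords"]
        # Emit an incoming and an outgoing link per matched keyword so that
        # documents mentioning the same keyword become mutually reachable.
        links: Set[Link] = set()
        for kw in keywords:
            if kw.lower() in input.lower():
                links.add(Link.incoming(kind=self._kind, tag=kw))
                links.add(Link.outgoing(kind=self._kind, tag=kw))
        return links


extractor = KeywordLinkExtractor()
texts = ["Astra DB uses JVector.", "JVector builds a graph index."]
# extract_batch pairs each input with its own kwargs and yields one link set
# per input, using the default implementation inherited from LinkExtractor.
for links in extractor.extract_batch(
    texts, batch_kwargs=[{"keywords": {"JVector"}}] * len(texts)
):
    print(links)
```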