Skip to content

Commit

Permalink
Add document interface to Graphql (#1272)
Browse files Browse the repository at this point in the history
* re-enable vector tests

* reestructure vectors module

* add empty graph test

* remove embeddings module from raphtory-graphql

* expose document interface through GraphQL

* split generate_doc into different trait

* make DocumentSource private

* make EntityDocument public for the crate

* expose document id as a vector of strings

* fix compilation errors and warnings

* ignore empty graph test in vectors module

* fix tantivity bug

* remove unused code
  • Loading branch information
ricopinazo authored Oct 2, 2023
1 parent a99719e commit 82e3f2d
Show file tree
Hide file tree
Showing 19 changed files with 957 additions and 980 deletions.
3 changes: 2 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion raphtory-graphql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ tracing-subscriber = {version = "0.3.16", features = ["std", "env-filter"]}
walkdir = "2"
ordered-float = "3.7.0"
uuid = "1.4.1"
async-openai = "0.14.0"
clap = { version = "4.3.11", features = ["derive"] }
chrono = { version = "0.4", features = ["serde"] }

Expand Down
2 changes: 1 addition & 1 deletion raphtory-graphql/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use raphtory::{
core::Prop,
prelude::{Graph, GraphViewOps, PropertyAdditionOps},
search::IndexedGraph,
vectors::VectorizedGraph,
vectors::vectorized_graph::VectorizedGraph,
};
use std::{
collections::{HashMap, HashSet},
Expand Down
19 changes: 0 additions & 19 deletions raphtory-graphql/src/embeddings.rs

This file was deleted.

1 change: 0 additions & 1 deletion raphtory-graphql/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ pub use crate::{model::algorithm::Algorithm, server::RaphtoryServer};
use base64::{prelude::BASE64_URL_SAFE_NO_PAD, DecodeError, Engine};
use raphtory::{core::utils::errors::GraphError, db::api::view::internal::MaterializedGraph};

pub mod embeddings;
mod model;
mod observability;
mod routes;
Expand Down
8 changes: 5 additions & 3 deletions raphtory-graphql/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,23 @@ use dotenv::dotenv;
use std::env;

mod data;
mod embeddings;
mod model;
mod observability;
mod routes;
mod server;

fn default_cache_dir() -> String {
"".to_owned()
}

#[derive(Parser, Debug)]
struct Args {
/// graphs to vectorize for similarity search
#[arg(short, long, num_args = 0.., value_delimiter = ' ')]
vectorize: Vec<String>,

/// directory to use to store the embbeding cache
// parenthesis are actually necessary or this does not compile!
#[arg(short, long, default_value_t = ("".to_string()))]
#[arg(short, long, default_value_t = default_cache_dir())]
cache: String,
}

Expand Down
28 changes: 28 additions & 0 deletions raphtory-graphql/src/model/graph/document.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use dynamic_graphql::SimpleObject;
use raphtory::vectors::Document;

#[derive(SimpleObject)]
pub(crate) struct GqlDocument {
/// Return a vector with the name of the node or the names of src and dst of the edge: [src, dst]
name: Vec<String>, // size 1 for nodes, size 2 for edges: [src, dst]
/// Return the type of entity: "node" or "edge"
entity_type: String,
content: String,
}

impl From<Document> for GqlDocument {
fn from(value: Document) -> Self {
match value {
Document::Node { name, content } => Self {
name: vec![name],
entity_type: "node".to_owned(),
content,
},
Document::Edge { src, dst, content } => Self {
name: vec![src, dst],
entity_type: "edge".to_owned(),
content,
},
}
}
}
1 change: 1 addition & 0 deletions raphtory-graphql/src/model/graph/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use raphtory::{
};
use std::collections::HashSet;

pub(crate) mod document;
pub(crate) mod edge;
pub(crate) mod graph;
pub(crate) mod node;
Expand Down
12 changes: 9 additions & 3 deletions raphtory-graphql/src/model/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use crate::{
data::Data,
model::graph::graph::{GqlGraph, GraphMeta},
model::graph::{
document::GqlDocument,
graph::{GqlGraph, GraphMeta},
},
};
use async_graphql::Context;
use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine};
Expand Down Expand Up @@ -99,7 +102,7 @@ impl QueryRoot {
limit: Option<usize>,
window_start: Option<i64>,
window_end: Option<i64>,
) -> Option<Vec<String>> {
) -> Option<Vec<GqlDocument>> {
let init = init.unwrap_or(1);
let min_nodes = min_nodes.unwrap_or(0);
let min_edges = min_edges.unwrap_or(0);
Expand All @@ -119,7 +122,10 @@ impl QueryRoot {
window_start,
window_end,
)
.await,
.await
.into_iter()
.map(|doc| doc.into())
.collect_vec(),
)
}
}
Expand Down
2 changes: 1 addition & 1 deletion raphtory-graphql/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use poem::{get, listener::TcpListener, middleware::Cors, EndpointExt, Route, Ser
use raphtory::{
db::graph::{edge::EdgeView, vertex::VertexView},
prelude::Graph,
vectors::{Embedding, Vectorizable},
vectors::{vectorizable::Vectorizable, Embedding},
};
use std::{collections::HashMap, future::Future, ops::Deref, path::Path};
use tokio::{io::Result as IoResult, signal};
Expand Down
7 changes: 5 additions & 2 deletions raphtory/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ tantivy = {version="0.20", optional=true} # 0.21 does not work (see https://gith
# vectors optional dependencies
futures-util = {version="0.3.0", optional=true}
async-trait = {version="0.1.73", optional=true}
async-openai = {version="0.14.0", optional=true}

# python binding optional dependencies
pyo3 = {version= "0.19.2", features=["multiple-pymethods", "chrono"], optional=true}
Expand All @@ -77,14 +78,16 @@ pretty_assertions = "1"
quickcheck = "1"
quickcheck_macros = "1"
tempfile = "3.2"
tokio = { version = "1.27.0", features = ["full"]} # for vector testing
dotenv = "0.15.0" # for vector testing

[features]
default = ["search"]
default = ["search", "vectors"]
# Enables the graph loader io module
io = ["dep:zip", "dep:neo4rs", "dep:bzip2", "dep:flate2", "dep:csv", "dep:serde_json", "dep:reqwest", "dep:tokio"]
# Enables generating the pyo3 python bindings
python = ["io", "vectors", "dep:pyo3", "dep:pyo3-asyncio", "dep:num", "dep:display-error-chain", "dep:arrow2"]
# search
search = ["dep:tantivy"]
# vectors
vectors = ["dep:futures-util", "dep:async-trait"]
vectors = ["dep:futures-util", "dep:async-trait", "dep:async-openai"]
7 changes: 5 additions & 2 deletions raphtory/src/python/packages/vectors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ use crate::{
},
prelude::{EdgeViewOps, VertexViewOps},
python::graph::views::graph_view::PyGraphView,
vectors::{Embedding, EmbeddingFunction, Vectorizable, VectorizedGraph},
vectors::{
vectorizable::Vectorizable, vectorized_graph::VectorizedGraph, DocumentOps, Embedding,
EmbeddingFunction,
},
};
use futures_util::future::BoxFuture;
use itertools::Itertools;
Expand Down Expand Up @@ -88,7 +91,7 @@ impl PyVectorizedGraph {
None,
)
.await;
Ok(docs)
Ok(docs.into_iter().map(|doc| doc.into_content()).collect_vec())
})
}
}
Expand Down
48 changes: 48 additions & 0 deletions raphtory/src/vectors/document_source.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use crate::{
db::graph::{edge::EdgeView, vertex::VertexView},
prelude::{EdgeViewOps, GraphViewOps, VertexViewOps},
vectors::{entity_id::EntityId, EntityDocument},
};

pub(crate) trait DocumentSource: Sized {
fn generate_doc<T>(&self, template: &T) -> EntityDocument
where
T: Fn(&Self) -> String;
}

impl<G: GraphViewOps> DocumentSource for VertexView<G> {
fn generate_doc<T>(&self, template: &T) -> EntityDocument
where
T: Fn(&Self) -> String,
{
let raw_content = template(self);
let content = match raw_content.char_indices().nth(1000) {
Some((index, _)) => (&raw_content[..index]).to_owned(),
None => raw_content,
};
// TODO: allow multi document entities !!!!!
// shortened to 1000 (around 250 tokens) to avoid exceeding the max number of tokens,
// when embedding but also when inserting documents into prompts

EntityDocument {
id: EntityId::Node { id: self.id() },
content,
}
}
}

impl<G: GraphViewOps> DocumentSource for EdgeView<G> {
fn generate_doc<T>(&self, template: &T) -> EntityDocument
where
T: Fn(&Self) -> String,
{
let content = template(self);
EntityDocument {
id: EntityId::Edge {
src: self.src().id(),
dst: self.dst().id(),
},
content,
}
}
}
49 changes: 49 additions & 0 deletions raphtory/src/vectors/embeddings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use crate::vectors::Embedding;
use async_openai::{
types::{CreateEmbeddingRequest, EmbeddingInput},
Client,
};
use itertools::Itertools;

pub async fn openai_embedding(texts: Vec<String>) -> Vec<Embedding> {
println!("computing embeddings for {} texts", texts.len());
let client = Client::new();
let request = CreateEmbeddingRequest {
model: "text-embedding-ada-002".to_owned(),
input: EmbeddingInput::StringArray(texts),
user: None,
};
let response = client.embeddings().create(request).await.unwrap();
println!("Generated embeddings successfully");
response.data.into_iter().map(|e| e.embedding).collect_vec()
}

// this is currently commented out so we don't need to add any new dependencies
// but might be potentially useful in the future
// async fn sentence_transformers_embeddings(texts: Vec<String>) -> Vec<Embedding> {
// println!("computing embeddings for {} texts", texts.len());
// Python::with_gil(|py| {
// let sentence_transformers = py.import("sentence_transformers")?;
// let locals = [("sentence_transformers", sentence_transformers)].into_py_dict(py);
// locals.set_item("texts", texts);
//
// let pyarray: &PyArray2<f32> = py
// .eval(
// &format!(
// "sentence_transformers.SentenceTransformer('thenlper/gte-small').encode(texts)"
// ),
// Some(locals),
// None,
// )?
// .extract()?;
//
// let readonly = pyarray.readonly();
// let chunks = readonly.as_slice().unwrap().chunks(384).into_iter();
// let embeddings = chunks
// .map(|chunk| chunk.iter().copied().collect_vec())
// .collect_vec();
//
// Ok::<Vec<Vec<f32>>, Box<dyn std::error::Error>>(embeddings)
// })
// .unwrap()
// }
57 changes: 57 additions & 0 deletions raphtory/src/vectors/entity_id.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use crate::{
db::graph::{edge::EdgeView, vertex::VertexView},
prelude::{EdgeViewOps, GraphViewOps, VertexViewOps},
};
use serde::Serializer;
use std::fmt::{Display, Formatter};

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) enum EntityId {
Node { id: u64 },
Edge { src: u64, dst: u64 },
}

impl<G: GraphViewOps> From<&VertexView<G>> for EntityId {
fn from(value: &VertexView<G>) -> Self {
EntityId::Node { id: value.id() }
}
}

impl<G: GraphViewOps> From<VertexView<G>> for EntityId {
fn from(value: VertexView<G>) -> Self {
EntityId::Node { id: value.id() }
}
}

impl<G: GraphViewOps> From<&EdgeView<G>> for EntityId {
fn from(value: &EdgeView<G>) -> Self {
EntityId::Edge {
src: value.src().id(),
dst: value.dst().id(),
}
}
}

impl<G: GraphViewOps> From<EdgeView<G>> for EntityId {
fn from(value: EdgeView<G>) -> Self {
EntityId::Edge {
src: value.src().id(),
dst: value.dst().id(),
}
}
}

impl Display for EntityId {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
EntityId::Node { id } => f.serialize_u64(*id),
EntityId::Edge { src, dst } => {
f.serialize_u64(*src)
.expect("src ID couldn't be serialized");
f.write_str("-")
.expect("edge ID separator couldn't be serialized");
f.serialize_u64(*dst)
}
}
}
}
Loading

0 comments on commit 82e3f2d

Please sign in to comment.