Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add document interface to Graphql #1272

Merged
merged 16 commits into from
Oct 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion raphtory-graphql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ tracing-subscriber = {version = "0.3.16", features = ["std", "env-filter"]}
walkdir = "2"
ordered-float = "3.7.0"
uuid = "1.4.1"
async-openai = "0.14.0"
clap = { version = "4.3.11", features = ["derive"] }
chrono = { version = "0.4", features = ["serde"] }

Expand Down
2 changes: 1 addition & 1 deletion raphtory-graphql/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use raphtory::{
core::Prop,
prelude::{Graph, GraphViewOps, PropertyAdditionOps},
search::IndexedGraph,
vectors::VectorizedGraph,
vectors::vectorized_graph::VectorizedGraph,
};
use std::{
collections::{HashMap, HashSet},
Expand Down
19 changes: 0 additions & 19 deletions raphtory-graphql/src/embeddings.rs

This file was deleted.

1 change: 0 additions & 1 deletion raphtory-graphql/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ pub use crate::{model::algorithm::Algorithm, server::RaphtoryServer};
use base64::{prelude::BASE64_URL_SAFE_NO_PAD, DecodeError, Engine};
use raphtory::{core::utils::errors::GraphError, db::api::view::internal::MaterializedGraph};

pub mod embeddings;
mod model;
mod observability;
mod routes;
Expand Down
8 changes: 5 additions & 3 deletions raphtory-graphql/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,23 @@ use dotenv::dotenv;
use std::env;

mod data;
mod embeddings;
mod model;
mod observability;
mod routes;
mod server;

fn default_cache_dir() -> String {
"".to_owned()
}

#[derive(Parser, Debug)]
struct Args {
/// graphs to vectorize for similarity search
#[arg(short, long, num_args = 0.., value_delimiter = ' ')]
vectorize: Vec<String>,

/// directory to use to store the embbeding cache
// parenthesis are actually necessary or this does not compile!
#[arg(short, long, default_value_t = ("".to_string()))]
#[arg(short, long, default_value_t = default_cache_dir())]
cache: String,
}

Expand Down
28 changes: 28 additions & 0 deletions raphtory-graphql/src/model/graph/document.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use dynamic_graphql::SimpleObject;
use raphtory::vectors::Document;

#[derive(SimpleObject)]
pub(crate) struct GqlDocument {
/// Return a vector with the name of the node or the names of src and dst of the edge: [src, dst]
name: Vec<String>, // size 1 for nodes, size 2 for edges: [src, dst]
/// Return the type of entity: "node" or "edge"
entity_type: String,
content: String,
}

impl From<Document> for GqlDocument {
fn from(value: Document) -> Self {
match value {
Document::Node { name, content } => Self {
name: vec![name],
entity_type: "node".to_owned(),
content,
},
Document::Edge { src, dst, content } => Self {
name: vec![src, dst],
entity_type: "edge".to_owned(),
content,
},
}
}
}
1 change: 1 addition & 0 deletions raphtory-graphql/src/model/graph/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use raphtory::{
};
use std::collections::HashSet;

pub(crate) mod document;
pub(crate) mod edge;
pub(crate) mod graph;
pub(crate) mod node;
Expand Down
12 changes: 9 additions & 3 deletions raphtory-graphql/src/model/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use crate::{
data::Data,
model::graph::graph::{GqlGraph, GraphMeta},
model::graph::{
document::GqlDocument,
graph::{GqlGraph, GraphMeta},
},
};
use async_graphql::Context;
use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine};
Expand Down Expand Up @@ -99,7 +102,7 @@ impl QueryRoot {
limit: Option<usize>,
window_start: Option<i64>,
window_end: Option<i64>,
) -> Option<Vec<String>> {
) -> Option<Vec<GqlDocument>> {
let init = init.unwrap_or(1);
let min_nodes = min_nodes.unwrap_or(0);
let min_edges = min_edges.unwrap_or(0);
Expand All @@ -119,7 +122,10 @@ impl QueryRoot {
window_start,
window_end,
)
.await,
.await
.into_iter()
.map(|doc| doc.into())
.collect_vec(),
)
}
}
Expand Down
2 changes: 1 addition & 1 deletion raphtory-graphql/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use poem::{get, listener::TcpListener, middleware::Cors, EndpointExt, Route, Ser
use raphtory::{
db::graph::{edge::EdgeView, vertex::VertexView},
prelude::Graph,
vectors::{Embedding, Vectorizable},
vectors::{vectorizable::Vectorizable, Embedding},
};
use std::{collections::HashMap, future::Future, ops::Deref, path::Path};
use tokio::{io::Result as IoResult, signal};
Expand Down
7 changes: 5 additions & 2 deletions raphtory/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ tantivy = {version="0.20", optional=true} # 0.21 does not work (see https://gith
# vectors optional dependencies
futures-util = {version="0.3.0", optional=true}
async-trait = {version="0.1.73", optional=true}
async-openai = {version="0.14.0", optional=true}

# python binding optional dependencies
pyo3 = {version= "0.19.2", features=["multiple-pymethods", "chrono"], optional=true}
Expand All @@ -77,14 +78,16 @@ pretty_assertions = "1"
quickcheck = "1"
quickcheck_macros = "1"
tempfile = "3.2"
tokio = { version = "1.27.0", features = ["full"]} # for vector testing
dotenv = "0.15.0" # for vector testing

[features]
default = ["search"]
default = ["search", "vectors"]
# Enables the graph loader io module
io = ["dep:zip", "dep:neo4rs", "dep:bzip2", "dep:flate2", "dep:csv", "dep:serde_json", "dep:reqwest", "dep:tokio"]
# Enables generating the pyo3 python bindings
python = ["io", "vectors", "dep:pyo3", "dep:pyo3-asyncio", "dep:num", "dep:display-error-chain", "dep:arrow2"]
# search
search = ["dep:tantivy"]
# vectors
vectors = ["dep:futures-util", "dep:async-trait"]
vectors = ["dep:futures-util", "dep:async-trait", "dep:async-openai"]
7 changes: 5 additions & 2 deletions raphtory/src/python/packages/vectors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ use crate::{
},
prelude::{EdgeViewOps, VertexViewOps},
python::graph::views::graph_view::PyGraphView,
vectors::{Embedding, EmbeddingFunction, Vectorizable, VectorizedGraph},
vectors::{
vectorizable::Vectorizable, vectorized_graph::VectorizedGraph, DocumentOps, Embedding,
EmbeddingFunction,
},
};
use futures_util::future::BoxFuture;
use itertools::Itertools;
Expand Down Expand Up @@ -88,7 +91,7 @@ impl PyVectorizedGraph {
None,
)
.await;
Ok(docs)
Ok(docs.into_iter().map(|doc| doc.into_content()).collect_vec())
})
}
}
Expand Down
48 changes: 48 additions & 0 deletions raphtory/src/vectors/document_source.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use crate::{
db::graph::{edge::EdgeView, vertex::VertexView},
prelude::{EdgeViewOps, GraphViewOps, VertexViewOps},
vectors::{entity_id::EntityId, EntityDocument},
};

pub(crate) trait DocumentSource: Sized {
fn generate_doc<T>(&self, template: &T) -> EntityDocument
where
T: Fn(&Self) -> String;
}

impl<G: GraphViewOps> DocumentSource for VertexView<G> {
fn generate_doc<T>(&self, template: &T) -> EntityDocument
where
T: Fn(&Self) -> String,
{
let raw_content = template(self);
let content = match raw_content.char_indices().nth(1000) {
Some((index, _)) => (&raw_content[..index]).to_owned(),
None => raw_content,
};
// TODO: allow multi document entities !!!!!
// shortened to 1000 (around 250 tokens) to avoid exceeding the max number of tokens,
// when embedding but also when inserting documents into prompts

EntityDocument {
id: EntityId::Node { id: self.id() },
content,
}
}
}

impl<G: GraphViewOps> DocumentSource for EdgeView<G> {
fn generate_doc<T>(&self, template: &T) -> EntityDocument
where
T: Fn(&Self) -> String,
{
let content = template(self);
EntityDocument {
id: EntityId::Edge {
src: self.src().id(),
dst: self.dst().id(),
},
content,
}
}
}
49 changes: 49 additions & 0 deletions raphtory/src/vectors/embeddings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
use crate::vectors::Embedding;
use async_openai::{
types::{CreateEmbeddingRequest, EmbeddingInput},
Client,
};
use itertools::Itertools;

pub async fn openai_embedding(texts: Vec<String>) -> Vec<Embedding> {
println!("computing embeddings for {} texts", texts.len());
let client = Client::new();
let request = CreateEmbeddingRequest {
model: "text-embedding-ada-002".to_owned(),
input: EmbeddingInput::StringArray(texts),
user: None,
};
let response = client.embeddings().create(request).await.unwrap();
println!("Generated embeddings successfully");
response.data.into_iter().map(|e| e.embedding).collect_vec()
}

// this is currently commented out so we don't need to add any new dependencies
// but might be potentially useful in the future
// async fn sentence_transformers_embeddings(texts: Vec<String>) -> Vec<Embedding> {
// println!("computing embeddings for {} texts", texts.len());
// Python::with_gil(|py| {
// let sentence_transformers = py.import("sentence_transformers")?;
// let locals = [("sentence_transformers", sentence_transformers)].into_py_dict(py);
// locals.set_item("texts", texts);
//
// let pyarray: &PyArray2<f32> = py
// .eval(
// &format!(
// "sentence_transformers.SentenceTransformer('thenlper/gte-small').encode(texts)"
// ),
// Some(locals),
// None,
// )?
// .extract()?;
//
// let readonly = pyarray.readonly();
// let chunks = readonly.as_slice().unwrap().chunks(384).into_iter();
// let embeddings = chunks
// .map(|chunk| chunk.iter().copied().collect_vec())
// .collect_vec();
//
// Ok::<Vec<Vec<f32>>, Box<dyn std::error::Error>>(embeddings)
// })
// .unwrap()
// }
57 changes: 57 additions & 0 deletions raphtory/src/vectors/entity_id.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use crate::{
db::graph::{edge::EdgeView, vertex::VertexView},
prelude::{EdgeViewOps, GraphViewOps, VertexViewOps},
};
use serde::Serializer;
use std::fmt::{Display, Formatter};

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) enum EntityId {
Node { id: u64 },
Edge { src: u64, dst: u64 },
}

impl<G: GraphViewOps> From<&VertexView<G>> for EntityId {
fn from(value: &VertexView<G>) -> Self {
EntityId::Node { id: value.id() }
}
}

impl<G: GraphViewOps> From<VertexView<G>> for EntityId {
fn from(value: VertexView<G>) -> Self {
EntityId::Node { id: value.id() }
}
}

impl<G: GraphViewOps> From<&EdgeView<G>> for EntityId {
fn from(value: &EdgeView<G>) -> Self {
EntityId::Edge {
src: value.src().id(),
dst: value.dst().id(),
}
}
}

impl<G: GraphViewOps> From<EdgeView<G>> for EntityId {
fn from(value: EdgeView<G>) -> Self {
EntityId::Edge {
src: value.src().id(),
dst: value.dst().id(),
}
}
}

impl Display for EntityId {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
EntityId::Node { id } => f.serialize_u64(*id),
EntityId::Edge { src, dst } => {
f.serialize_u64(*src)
.expect("src ID couldn't be serialized");
f.write_str("-")
.expect("edge ID separator couldn't be serialized");
f.serialize_u64(*dst)
}
}
}
}
Loading