Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More doctests #47

Merged
merged 5 commits into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ fastembed = "4.3.0"

[dev-dependencies]
wasm-bindgen-test = "0.3"
tokio = { version = "1", features = ["rt", "macros"] }
rand = "0.8"
tokio-test = "0.4"


[profile.release]
# Tell `rustc` to optimize for small code size.
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,15 @@ let mut victor = Db::new(PathBuf::from("./victor_test_data"));
victor.clear_db().await.unwrap();

victor
.add_many(
.add(
vec!["Pineapple", "Rocks"], // documents
vec!["Pizza Toppings"], // tags (only used for filtering)
)
.await;

victor.add("Cheese pizza", vec!["Pizza Flavors"]).await; // Add another entry with no tags
victor
.add_single("Cheese pizza", vec!["Pizza Flavors"])
.await; // Add another entry with no tags

// read the 10 closest results from victor that are tagged with "Pizza Toppings"
// (only 2 will be returned because only two of the inserted embeddings carry that tag)
Expand Down
6 changes: 4 additions & 2 deletions examples/in_memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@ async fn main() {
victor.clear_db().await.unwrap();

victor
.add_many(
.add(
vec!["Pineapple", "Rocks"], // documents
vec!["Pizza Toppings"], // tags (only used for filtering)
)
.await;

victor.add("Cheese pizza", vec!["Pizza Flavors"]).await; // Add another entry with no tags
victor
.add_single("Cheese pizza", vec!["Pizza Flavors"])
.await; // Add another entry with no tags

// read the 10 closest results from victor that are tagged with "Pizza Toppings"
// (only 2 will be returned because only two of the inserted embeddings carry that tag)
Expand Down
6 changes: 4 additions & 2 deletions examples/native_filesystem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ async fn main() {
victor.clear_db().await.unwrap();

victor
.add_many(
.add(
vec!["Pineapple", "Rocks"], // documents
vec!["Pizza Toppings"], // tags (only used for filtering)
)
.await;

victor.add("Cheese pizza", vec!["Pizza Flavors"]).await; // Add another entry with no tags
victor
.add_single("Cheese pizza", vec!["Pizza Flavors"])
.await; // Add another entry with no tags

// read the 10 closest results from victor that are tagged with "Pizza Toppings"
// (only 2 will be returned because only two of the inserted embeddings carry that tag)
Expand Down
76 changes: 62 additions & 14 deletions src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use crate::{
similarity,
};

/// The main database struct.
/// Through this you can [`Victor::add`] and [`Victor::search`] for embeddings.
pub struct Victor<D> {
root: D,
}
Expand Down Expand Up @@ -72,19 +74,32 @@ extern "C" {
}

impl<D: DirectoryHandle> Victor<D> {
/// Build a Victor database on top of the given root directory handle.
///
/// `root` may be any value convertible into the handle type `D`; the
/// conversion is performed here and the result is stored as the database root.
///
/// For example, pass a [`std::path::PathBuf`] to use the native filesystem,
/// or a [`crate::memory::DirectoryHandle`] to use an in-memory database.
pub fn new(root: impl Into<D>) -> Self {
    Self { root: root.into() }
}

/// Add many documents to the database.
/// Embeddings will be generated for each document.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor
/// .add(
/// vec!["Pineapple", "Rocks"], // documents
/// vec!["Pizza Toppings"], // tags (only used for filtering)
/// )
/// .await;
/// # })
/// ```
#[cfg(not(target_arch = "wasm32"))]
pub async fn add_many(
&mut self,
content: Vec<impl Into<String>>,
tags: Vec<impl Into<String>>,
) {
pub async fn add(&mut self, content: Vec<impl Into<String>>, tags: Vec<impl Into<String>>) {
let tags = tags.into_iter().map(|t| t.into()).collect::<Vec<String>>();
let model = fastembed::TextEmbedding::try_new(Default::default()).unwrap();
let content = content
Expand All @@ -95,20 +110,36 @@ impl<D: DirectoryHandle> Victor<D> {
let vectors = model.embed(content.clone(), None).unwrap();

let to_add = content.into_iter().zip(vectors.into_iter()).collect();
self.add_embedding_many(to_add, tags).await;
self.add_embeddings(to_add, tags).await;
}

/// Add a single document to the database.
/// Embedding will be generated for the document.
/// When adding many documents, it is more efficient to use `add_many`.
/// When adding many documents, it is more efficient to use `add`.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.add_single("Pepperoni pizza", vec!["Pizza Flavors"]).await;
/// # })
/// ```
#[cfg(not(target_arch = "wasm32"))]
pub async fn add(&mut self, content: impl Into<String>, tags: Vec<impl Into<String>>) {
self.add_many(vec![content], tags).await;
pub async fn add_single(&mut self, content: impl Into<String>, tags: Vec<impl Into<String>>) {
self.add(vec![content], tags).await;
}

/// Add many documen/embedding pairs to the database.
/// Add many document/embedding pairs to the database.
/// This is useful for adding embeddings that have already been generated.
pub async fn add_embedding_many(
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.add_embeddings(vec![("Pepperoni pizza", vec![0.1, 0.2, 0.3])], vec!["Pizza Flavors"]).await;
/// # })
/// ```
pub async fn add_embeddings(
&mut self,
to_add: Vec<(impl Into<String>, Vec<f32>)>,
tags: Vec<impl Into<String>>,
Expand All @@ -134,19 +165,35 @@ impl<D: DirectoryHandle> Victor<D> {

/// Add a single document/embedding pair to the database.
/// This is useful for adding embeddings that have already been generated.
/// When adding many documents, it is more efficient to use `add_embedding_many`.
pub async fn add_embedding(
/// When adding many documents, it is more efficient to use `add_embeddings`.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.add_single_embedding("Pepperoni pizza", vec![0.1, 0.2, 0.3], vec!["Pizza Flavors"]).await;
/// # })
/// ```
pub async fn add_single_embedding(
&mut self,
content: impl Into<String>,
vector: Vec<f32>,
tags: Vec<impl Into<String>>,
) {
self.add_embedding_many(vec![(content, vector)], tags).await;
self.add_embeddings(vec![(content, vector)], tags).await;
}

/// Search the database for the nearest neighbors to a given document.
/// An embedding will be generated for the document being searched for.
/// This will return the top `top_n` nearest neighbors.
///
/// ```rust
/// # tokio_test::block_on(async {
/// # use victor_db::memory::{Db, DirectoryHandle};
/// # let mut victor = Db::new(DirectoryHandle::default());
/// victor.search("Pepperoni pizza", vec!["Pizza Flavors"], 10).await;
/// # })
/// ```
#[cfg(not(target_arch = "wasm32"))]
pub async fn search(
&self,
Expand Down Expand Up @@ -557,6 +604,7 @@ impl<D: DirectoryHandle> Victor<D> {
content.to_string()
}

/// Clear the database, deleting all data.
pub async fn clear_db(&mut self) -> Result<(), D::Error> {
// clear db files
let files = Index::get_all_db_filenames(&mut self.root).await?;
Expand Down
104 changes: 100 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,91 @@
//! A browser-optimized vector database. Backed by the private virtual filesystem API on web. On native, supports running with the native filesystem or in memory.
//! A browser-optimized vector database. Backed by the private virtual filesystem API on web.
//!
//! You're viewing this on crates.io, so you're probably interested in the native version. The native version supports running with the native filesystem or in memory.
//!
//! If you want to use it on the web, [check out victor-db on npm](https://www.npmjs.com/package/victor-db).
//!
//! ## In-memory database
//!
//! Use this if you want to run victor in-memory (all data is lost when the program exits).
//!
//! The in-memory version is useful for testing and applications where you don't need to persist data:
//! ```rust
//! # tokio_test::block_on(async {
//! // use victor_db::memory for the in-memory implementation
//! use victor_db::memory::{Db, DirectoryHandle};
//!
//! // create a new in-memory database
//! let mut victor = Db::new(DirectoryHandle::default());
//!
//! // add some embeddings to the database
//! victor
//! .add(
//! vec!["Pineapple", "Rocks"], // documents
//! vec!["Pizza Toppings"], // tags (only used for filtering)
//! )
//! .await;
//!
//! // add another embedding to the database, this time under a different tag ("Pizza Flavors")
//! victor.add_single("Cheese pizza", vec!["Pizza Flavors"]).await;
//!
//! // read the 10 closest results from victor that are tagged with "Pizza Toppings"
//! // (only 2 will be returned because only two of the inserted embeddings carry that tag)
//! let nearest = victor
//! .search("Hawaiian pizza", vec!["Pizza Toppings"], 10)
//! .await
//! .first()
//! .unwrap()
//! .content
//! .clone();
//! assert_eq!(nearest, "Pineapple".to_string());
//!
//! // Clear the database
//! victor.clear_db().await.unwrap();
//! # })
//! ```
//!
//! ## Native database
//!
//! Use this if you want to persist your database to disk.
//!
//! ```rust
//! # tokio_test::block_on(async {
//! // use victor_db::native for the native filesystem implementation
//! use victor_db::native::Db;
//! use std::path::PathBuf;
//!
//! // create a new native database under "./victor_test_data"
//! let _ = std::fs::create_dir("./victor_test_data");
//! let mut victor = Db::new(PathBuf::from("./victor_test_data"));
//!
//! // add some embeddings to the database
//! victor
//! .add(
//! vec!["Pineapple", "Rocks"], // documents
//! vec!["Pizza Toppings"], // tags (only used for filtering)
//! )
//! .await;
//!
//! // add another embedding to the database, this time under a different tag ("Pizza Flavors")
//! victor.add_single("Cheese pizza", vec!["Pizza Flavors"]).await;
//!
//! // read the 10 closest results from victor that are tagged with "Pizza Toppings"
//! // (only 2 will be returned because only two of the inserted embeddings carry that tag)
//! let nearest = victor
//! .search("Hawaiian pizza", vec!["Pizza Toppings"], 10)
//! .await
//! .first()
//! .unwrap()
//! .content
//! .clone();
//! assert_eq!(nearest, "Pineapple".to_string());
//!
//! // Clear the database
//! victor.clear_db().await.unwrap();
//! # })
//! ```
//!
//! See the docs for [`Victor`] for more information.

#![deny(missing_docs)]

Expand All @@ -9,6 +96,9 @@ mod packed_vector;
mod similarity;
mod utils;

#[cfg(not(target_arch = "wasm32"))]
pub use db::Victor;

#[cfg(test)]
mod tests;

Expand All @@ -22,7 +112,9 @@ type Victor = crate::db::Victor<filesystem::web::DirectoryHandle>;

// Native

/// Used to tell victor to use the native filesystem.
/// Victor's native filesystem implementation.
///
/// Use this if you want to persist your database to disk.
#[cfg(not(target_arch = "wasm32"))]
pub mod native {
use crate::db::Victor;
Expand All @@ -31,7 +123,9 @@ pub mod native {
pub type Db = Victor<crate::filesystem::native::DirectoryHandle>;
}

/// Used to tell victor to use an in-memory filesystem.
/// Victor's in-memory implementation.
///
/// Use this if you want to run victor in-memory (all data is lost when the program exits).
#[cfg(not(target_arch = "wasm32"))]
pub mod memory {
use crate::db::Victor;
Expand Down Expand Up @@ -105,7 +199,9 @@ impl Db {
})
.unwrap_or(vec![]);

self.victor.add_embedding(content, embedding, tags).await;
self.victor
.add_single_embedding(content, embedding, tags)
.await;
}

/// Search the database for the nearest neighbors to a given embedding.
Expand Down
18 changes: 9 additions & 9 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ async fn store_and_retrieve() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding.clone(), Vec::<String>::new())
.add_single_embedding("hello", embedding.clone(), Vec::<String>::new())
.await;

let result = victor
Expand All @@ -29,10 +29,10 @@ async fn store_two_and_retrieve() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding_1.clone(), Vec::<String>::new())
.add_single_embedding("hello", embedding_1.clone(), Vec::<String>::new())
.await;
victor
.add_embedding("goodbye", embedding_2.clone(), Vec::<String>::new())
.add_single_embedding("goodbye", embedding_2.clone(), Vec::<String>::new())
.await;

{
Expand Down Expand Up @@ -67,10 +67,10 @@ async fn store_two_and_retrieve_with_tags() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding_1.clone(), vec!["greetings".to_string()])
.add_single_embedding("hello", embedding_1.clone(), vec!["greetings".to_string()])
.await;
victor
.add_embedding("goodbye", embedding_2.clone(), vec!["goodbyes".to_string()])
.add_single_embedding("goodbye", embedding_2.clone(), vec!["goodbyes".to_string()])
.await;

{
Expand Down Expand Up @@ -135,19 +135,19 @@ async fn incompatible_size_panic() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_embedding("hello", embedding_1, Vec::<String>::new())
.add_single_embedding("hello", embedding_1, Vec::<String>::new())
.await;
victor
.add_embedding("hello", embedding_2, Vec::<String>::new())
.add_single_embedding("hello", embedding_2, Vec::<String>::new())
.await;
}

#[tokio::test]
async fn add_many() {
async fn add() {
let mut victor = Db::new(DirectoryHandle::default());

victor
.add_many(vec!["pineapple", "rocks"], Vec::<String>::new())
.add(vec!["pineapple", "rocks"], Vec::<String>::new())
.await;

let result = victor
Expand Down
Loading