From a7d99088e509a1b2fe00057aa5ccb447368efee2 Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Mon, 10 Feb 2025 16:13:54 +0100 Subject: [PATCH 1/4] wip duckdb --- Cargo.toml | 1 + swiftide-core/src/template.rs | 1 + swiftide-integrations/Cargo.toml | 3 + swiftide-integrations/src/duckdb/mod.rs | 58 +++++++++++++++ swiftide-integrations/src/duckdb/persist.rs | 76 ++++++++++++++++++++ swiftide-integrations/src/duckdb/retrieve.rs | 0 swiftide-integrations/src/duckdb/schema.sql | 17 +++++ swiftide-integrations/src/duckdb/upsert.sql | 16 +++++ swiftide-integrations/src/lib.rs | 2 + 9 files changed, 174 insertions(+) create mode 100644 swiftide-integrations/src/duckdb/mod.rs create mode 100644 swiftide-integrations/src/duckdb/persist.rs create mode 100644 swiftide-integrations/src/duckdb/retrieve.rs create mode 100644 swiftide-integrations/src/duckdb/schema.sql create mode 100644 swiftide-integrations/src/duckdb/upsert.sql diff --git a/Cargo.toml b/Cargo.toml index 5940a0b2..9a47d768 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -85,6 +85,7 @@ tree-sitter-rust = "0.23" tree-sitter-typescript = "0.23" tree-sitter-go = "0.23" tree-sitter-solidity = "1.2.11" +duckdb = { version = "1.1.1", default-features = false } # Testing diff --git a/swiftide-core/src/template.rs b/swiftide-core/src/template.rs index 05d61e55..61d9f143 100644 --- a/swiftide-core/src/template.rs +++ b/swiftide-core/src/template.rs @@ -2,6 +2,7 @@ use anyhow::{Context as _, Result}; use tokio::sync::RwLock; use lazy_static::lazy_static; +pub use tera::Context; use tera::Tera; use uuid::Uuid; diff --git a/swiftide-integrations/Cargo.toml b/swiftide-integrations/Cargo.toml index 45819655..fb3a228a 100644 --- a/swiftide-integrations/Cargo.toml +++ b/swiftide-integrations/Cargo.toml @@ -84,6 +84,7 @@ parquet = { workspace = true, optional = true, features = [ ] } arrow = { workspace = true, optional = true } redb = { workspace = true, optional = true } +duckdb = { workspace = true, optional = true } [dev-dependencies] swiftide-core = { path = "../swiftide-core", features = ["test-utils"] } @@ -154,6 +155,8 @@ fluvio = ["dep:fluvio"] parquet = ["dep:arrow-array", "dep:parquet", "dep:arrow"] # Redb as an embeddable node cache redb = ["dep:redb"] +# Duckdb for indexing and retrieval +duckdb = ["dep:duckdb"] [lints] diff --git a/swiftide-integrations/src/duckdb/mod.rs b/swiftide-integrations/src/duckdb/mod.rs new file mode 100644 index 00000000..46326bfb --- /dev/null +++ b/swiftide-integrations/src/duckdb/mod.rs @@ -0,0 +1,58 @@ +use std::{ + collections::HashMap, + sync::{Arc, OnceLock}, +}; + +use derive_builder::Builder; +use swiftide_core::indexing::EmbeddedField; +use tokio::sync::Mutex; + +pub mod persist; +pub mod retrieve; + +#[derive(Clone, Builder)] +#[builder(setter(into))] +pub struct Duckdb { + #[builder(setter(custom))] + connection: Arc>, + table_name: String, + + // The vectors to be stored, field name -> size + vectors: HashMap, + + #[builder(default = "256")] + batch_size: usize, + + #[builder(default = OnceLock::new())] + node_upsert_sql: OnceLock, +} + +impl std::fmt::Debug for Duckdb { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Duckdb") + .field("connection", &"Arc>") + .field("table_name", &self.table_name) + .field("batch_size", &self.batch_size) + .finish() + } +} + +impl Duckdb { + pub fn builder() -> DuckdbBuilder { + DuckdbBuilder::default() + } + + pub async fn connection(&self) -> &Mutex { + &self.connection + } +} + +impl DuckdbBuilder { + pub fn connection( + &mut self, + connection: impl Into>>, + ) -> &mut Self { + self.connection = Some(connection.into()); + self + } +} diff --git a/swiftide-integrations/src/duckdb/persist.rs b/swiftide-integrations/src/duckdb/persist.rs new file mode 100644 index 00000000..2e27a6bd --- /dev/null +++ b/swiftide-integrations/src/duckdb/persist.rs @@ -0,0 +1,76 @@ +use anyhow::Result; +use async_trait::async_trait; +use duckdb::params; +use swiftide_core::{ + indexing, + template::{Context, Template}, + Persist, +}; + +use super::Duckdb; + +const SCHEMA: &str = include_str!("schema.sql"); +const UPSERT: &str = include_str!("upsert.sql"); + +#[async_trait] +impl Persist for Duckdb { + async fn setup(&self) -> Result<()> { + let mut context = Context::default(); + context.insert("table_name", &self.table_name); + context.insert("vectors", &self.vectors); + + let rendered = Template::Static(SCHEMA).render(&context).await?; + self.connection.lock().await.execute_batch(&rendered)?; + + context.insert( + "vector_field_names", + &self.vectors.keys().collect::>(), + ); + + // User could have overridden the upsert sql + // Which is fine + let upsert = Template::Static(UPSERT).render(&context).await?; + self.node_upsert_sql + .set(upsert) + .map_err(|_| anyhow::anyhow!("Failed to set upsert sql"))?; + + Ok(()) + } + + async fn store(&self, node: indexing::Node) -> Result { + let Some(query) = self.node_upsert_sql.get() else { + anyhow::bail!("Upsert sql in Duckdb not set"); + }; + + let mut stmt = self.connection.lock().await.prepare(query)?; + let value_iter = [ + node.id(), + node.chunk, + node.path, + node.metadata, + ] + stmt.execute(params![ + node.id(), + node.chunk, + node.path, + node.metadata, + node.original_size, + node.range.start, + node.range.end, + node.range.line_start, + node.range.line_end, + node.range.column_start, + node.range.column_end, + ])?; + + // TODO: Investigate concurrency in duckdb, maybe optmistic if it works + self.connection + .lock() + .await + .execute_batch(&self.node_upsert_sql.get())?; + } + + async fn batch_store(&self, nodes: Vec) -> indexing::IndexingStream { + todo!() + } +} diff --git a/swiftide-integrations/src/duckdb/retrieve.rs b/swiftide-integrations/src/duckdb/retrieve.rs new file mode 100644 index 00000000..e69de29b diff --git a/swiftide-integrations/src/duckdb/schema.sql b/swiftide-integrations/src/duckdb/schema.sql new file mode 100644 index 00000000..674d1184 --- /dev/null +++ b/swiftide-integrations/src/duckdb/schema.sql @@ -0,0 +1,17 @@ +INSTALL vss; +LOAD vss; + +CREATE TABLE IF NOT EXISTS {{table_name}} ( + uuid VARCHAR PRIMARY KEY, + chunk VARCHAR NOT NULL, + path VARCHAR, + metadata MAP(VARCHAR, VARCHAR) + original_size INT, + offset INT, + + -- NOTE mind want to add created / updated timestamps + + {% for vector, size in vectors %} + {{vector}} FLOAT[size], + {% endfor %} +); diff --git a/swiftide-integrations/src/duckdb/upsert.sql b/swiftide-integrations/src/duckdb/upsert.sql new file mode 100644 index 00000000..1734a27c --- /dev/null +++ b/swiftide-integrations/src/duckdb/upsert.sql @@ -0,0 +1,16 @@ +INSERT INTO {{ table_name }} (uuid, chunk, path, metadata, original_size, offset, {{ vector_field_names | join(",") }}) +VALUES (?, ?, ?, ?, ?, ?, + {% for _ in range(end=vector_field_names | len) %} + ?, + {% endfor %} + ) +) ON CONFINCT (uuid) DO UPDATE SET + chunk = EXCLUDED.chunk, + path = EXCLUDED.path, + metadata = EXCLUDED.metadata, + original_size = EXCLUDED.original_size, + offset = EXCLUDED.offset, + {% for vector in vector_field_names %} + {{ vector }} = EXCLUDED.{{ vector }}, + {% endfor %} + ; diff --git a/swiftide-integrations/src/lib.rs b/swiftide-integrations/src/lib.rs index 95d196e2..f6750442 100644 --- a/swiftide-integrations/src/lib.rs +++ b/swiftide-integrations/src/lib.rs @@ -4,6 +4,8 @@ pub mod aws_bedrock; #[cfg(feature = "dashscope")] pub mod dashscope; +#[cfg(feature = "duckdb")] +pub mod duckdb; #[cfg(feature = "fastembed")] pub mod fastembed; #[cfg(feature = "fluvio")] From d2acad46f4117f3fdf0b3edf062e385711bd529e Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Wed, 29 Jan 2025 18:21:40 +0100 Subject: [PATCH 2/4] At least it compiles --- swiftide-integrations/Cargo.toml | 2 + swiftide-integrations/src/duckdb/mod.rs | 14 +- swiftide-integrations/src/duckdb/persist.rs | 150 ++++++++++++++++---- swiftide-integrations/src/duckdb/schema.sql | 7 +- swiftide-integrations/src/duckdb/upsert.sql | 10 +- 5 files changed, 139 insertions(+), 44 deletions(-) diff --git a/swiftide-integrations/Cargo.toml b/swiftide-integrations/Cargo.toml index fb3a228a..b2515848 100644 --- a/swiftide-integrations/Cargo.toml +++ b/swiftide-integrations/Cargo.toml @@ -27,6 +27,7 @@ strum = { workspace = true } strum_macros = { workspace = true } regex = { workspace = true } futures-util = { workspace = true } +uuid = { workspace = true } # Integrations async-openai = { workspace = true, optional = true } @@ -94,6 +95,7 @@ swiftide-test-utils = { path = "../swiftide-test-utils", features = [ temp-dir = { workspace = true } pretty_assertions = { workspace = true } arrow = { workspace = true, features = ["test_utils"] } +duckdb = { workspace = true, features = ["bundled"] } # Used for hacking fluv to play nice flv-util = { workspace = true } diff --git a/swiftide-integrations/src/duckdb/mod.rs b/swiftide-integrations/src/duckdb/mod.rs index 46326bfb..af997fee 100644 --- a/swiftide-integrations/src/duckdb/mod.rs +++ b/swiftide-integrations/src/duckdb/mod.rs @@ -48,11 +48,15 @@ impl Duckdb { } impl DuckdbBuilder { - pub fn connection( - &mut self, - connection: impl Into>>, - ) -> &mut Self { - self.connection = Some(connection.into()); + pub fn connection(&mut self, connection: impl Into) -> &mut Self { + self.connection = Some(Arc::new(Mutex::new(connection.into()))); + self + } + + pub fn with_vector(&mut self, field: EmbeddedField, size: usize) -> &mut Self { + self.vectors + .get_or_insert_with(HashMap::new) + .insert(field, size); self } } diff --git a/swiftide-integrations/src/duckdb/persist.rs b/swiftide-integrations/src/duckdb/persist.rs index 2e27a6bd..ab1bfc1c 100644 --- a/swiftide-integrations/src/duckdb/persist.rs +++ b/swiftide-integrations/src/duckdb/persist.rs @@ -1,17 +1,58 @@ -use anyhow::Result; +use std::{collections::HashMap, path::Path}; + +use anyhow::{Context as _, Result}; use async_trait::async_trait; -use duckdb::params; +use duckdb::{ + params_from_iter, + types::{OrderedMap, ToSqlOutput, Value}, + ToSql, +}; use swiftide_core::{ - indexing, + indexing::{self, EmbeddedField, Metadata}, template::{Context, Template}, Persist, }; +use uuid::Uuid; use super::Duckdb; const SCHEMA: &str = include_str!("schema.sql"); const UPSERT: &str = include_str!("upsert.sql"); +enum NodeValues<'a> { + Uuid(Uuid), + Path(&'a Path), + Chunk(&'a str), + Metadata(&'a Metadata), + Vector(&'a [f32]), +} + +impl ToSql for NodeValues<'_> { + fn to_sql(&self) -> duckdb::Result> { + match self { + NodeValues::Uuid(uuid) => Ok(ToSqlOutput::Owned(uuid.to_string().into())), + NodeValues::Path(path) => Ok(path.to_string_lossy().to_string().into()), // Should be borrow-able + NodeValues::Chunk(chunk) => chunk.to_sql(), + NodeValues::Metadata(metadata) => { + let ordered_map: OrderedMap = metadata + .iter() + .map(|(k, v)| { + ( + k.to_string().into(), + serde_json::to_string(v).unwrap().into(), + ) + }) + .collect::>() + .into(); + Ok(ToSqlOutput::Owned(duckdb::types::Value::Map(ordered_map))) + } + NodeValues::Vector(vector) => Ok(ToSqlOutput::Owned(Value::Array( + vector.iter().map(|f| (*f).into()).collect(), + ))), + } + } +} + #[async_trait] impl Persist for Duckdb { async fn setup(&self) -> Result<()> { @@ -42,35 +83,88 @@ impl Persist for Duckdb { anyhow::bail!("Upsert sql in Duckdb not set"); }; - let mut stmt = self.connection.lock().await.prepare(query)?; - let value_iter = [ - node.id(), - node.chunk, - node.path, - node.metadata, - ] - stmt.execute(params![ - node.id(), - node.chunk, - node.path, - node.metadata, - node.original_size, - node.range.start, - node.range.end, - node.range.line_start, - node.range.line_end, - node.range.column_start, - node.range.column_end, - ])?; + // TODO: Doing potentially many locks here for the duration of a single query, + // SOMEONE IS GOING TO HAVE A BAD TIME + let lock = self.connection.lock().await; + let mut stmt = lock.prepare(query)?; + + // metadata needs to be converted to `map_from_entries([('key1', value)])`` + // TODO: Investigate if we can do with way less allocations + let mut values = vec![ + NodeValues::Uuid(node.id()), + NodeValues::Chunk(&node.chunk), + NodeValues::Path(&node.path), + NodeValues::Metadata(&node.metadata), + ]; + + let Some(node_vectors) = &node.vectors else { + anyhow::bail!("Expected node to have vectors; cannot store into duckdb"); + }; + + for (field, size) in &self.vectors { + let Some(vector) = node_vectors.get(field) else { + anyhow::bail!("Expected vector for field {} in node", field); + }; + + values.push(NodeValues::Vector(vector)); + } // TODO: Investigate concurrency in duckdb, maybe optmistic if it works - self.connection - .lock() - .await - .execute_batch(&self.node_upsert_sql.get())?; + stmt.execute(params_from_iter(values)) + .context("Failed to store node")?; + + Ok(node) } async fn batch_store(&self, nodes: Vec) -> indexing::IndexingStream { - todo!() + // TODO: Must batch + let mut new_nodes = vec![]; + for node in nodes { + new_nodes.push(self.store(node).await); + } + new_nodes.into() + } +} + +#[cfg(test)] +mod tests { + use indexing::{EmbeddedField, Node}; + + use super::*; + + #[test_log::test(tokio::test)] + async fn test_persisting_nodes() { + let client = Duckdb::builder() + .connection(duckdb::Connection::open_in_memory().unwrap()) + .table_name("test".to_string()) + .with_vector(EmbeddedField::Combined, 3) + .build() + .unwrap(); + + let node = Node::new("Hello duckdb!") + .with_vectors([(EmbeddedField::Combined, vec![1.0, 2.0, 3.0])]) + .to_owned(); + + client.setup().await.unwrap(); + client.store(node).await.unwrap(); + + let connection = client.connection.lock().await; + let mut stmt = connection.prepare("SELECT * FROM test").unwrap(); + let node_iter = stmt + .query_map([], |row| { + Ok(( + row.get::<_, String>(0).unwrap(), // id + row.get::<_, String>(1).unwrap(), // chunk + row.get::<_, String>(2).unwrap(), // path + row.get::<_, String>(3).unwrap(), // metadata + row.get::<_, String>(4).unwrap(), // vector + )) + }) + .unwrap(); + + let retrieved = node_iter.collect::, _>>().unwrap(); + + assert_eq!(retrieved.len(), 1); + assert_eq!(retrieved[0].1, "Hello duckdb!"); } } diff --git a/swiftide-integrations/src/duckdb/schema.sql b/swiftide-integrations/src/duckdb/schema.sql index 674d1184..affd983c 100644 --- a/swiftide-integrations/src/duckdb/schema.sql +++ b/swiftide-integrations/src/duckdb/schema.sql @@ -5,13 +5,10 @@ CREATE TABLE IF NOT EXISTS {{table_name}} ( uuid VARCHAR PRIMARY KEY, chunk VARCHAR NOT NULL, path VARCHAR, - metadata MAP(VARCHAR, VARCHAR) - original_size INT, - offset INT, - + metadata MAP(VARCHAR, VARCHAR), -- NOTE mind want to add created / updated timestamps {% for vector, size in vectors %} - {{vector}} FLOAT[size], + {{vector}} FLOAT[{{size}}], {% endfor %} ); diff --git a/swiftide-integrations/src/duckdb/upsert.sql b/swiftide-integrations/src/duckdb/upsert.sql index 1734a27c..9546d60a 100644 --- a/swiftide-integrations/src/duckdb/upsert.sql +++ b/swiftide-integrations/src/duckdb/upsert.sql @@ -1,15 +1,13 @@ -INSERT INTO {{ table_name }} (uuid, chunk, path, metadata, original_size, offset, {{ vector_field_names | join(",") }}) -VALUES (?, ?, ?, ?, ?, ?, - {% for _ in range(end=vector_field_names | len) %} +INSERT INTO {{ table_name }} (uuid, chunk, path, metadata, {{ vector_field_names | join(sep=", ") }}) +VALUES (?, ?, ?, ?, + {% for _ in range(end=vector_field_names | length) %} ?, {% endfor %} ) -) ON CONFINCT (uuid) DO UPDATE SET +ON CONFLICT (uuid) DO UPDATE SET chunk = EXCLUDED.chunk, path = EXCLUDED.path, metadata = EXCLUDED.metadata, - original_size = EXCLUDED.original_size, - offset = EXCLUDED.offset, {% for vector in vector_field_names %} {{ vector }} = EXCLUDED.{{ vector }}, {% endfor %} From d344c4837998722da02d834c82e9f8d22ddb3b8a Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Mon, 3 Feb 2025 22:19:47 +0100 Subject: [PATCH 3/4] Get persist to work --- Cargo.lock | 284 +++++++++++++++++-- swiftide-integrations/src/duckdb/persist.rs | 72 +++-- swiftide-integrations/src/duckdb/retrieve.rs | 4 + swiftide-integrations/src/duckdb/schema.sql | 2 - swiftide-integrations/src/duckdb/upsert.sql | 5 +- 5 files changed, 310 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 285f484b..864b731c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,17 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "ahash" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" +dependencies = [ + "getrandom 0.2.15", + "once_cell", + "version_check", +] + [[package]] name = "ahash" version = "0.8.11" @@ -223,7 +234,7 @@ version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-buffer", "arrow-data", "arrow-schema", @@ -354,7 +365,7 @@ version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f936954991c360ba762dff23f5dda16300774fafd722353d9683abd97630ae" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-data", @@ -377,7 +388,7 @@ version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-data", @@ -1408,6 +1419,29 @@ dependencies = [ "serde_with", ] +[[package]] +name = "borsh" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5430e3be710b68d984d1391c854eb431a9d548640711faa54eecb1df93db91cc" +dependencies = [ + "borsh-derive", + "cfg_aliases", +] + +[[package]] +name = "borsh-derive" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8b668d39970baad5356d7c83a86fee3a539e6f93bf6764c97368243e17a0487" +dependencies = [ + "once_cell", + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.98", +] + [[package]] name = "brotli" version = "7.0.0" @@ -1451,6 +1485,28 @@ version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +[[package]] +name = "bytecheck" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" +dependencies = [ + "bytecheck_derive", + "ptr_meta", + "simdutf8", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "bytecount" version = "0.6.8" @@ -2194,7 +2250,7 @@ version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b42b7d720fe21ed9cca2ebb635f3f13a12cfab786b41e0fba184fb2e620525b" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", @@ -2309,7 +2365,7 @@ version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aabbe48fba18f9981b134124381bee9e46f93518b8ad2f9721ee296cef5affb9" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-schema", "datafusion-common", @@ -2331,7 +2387,7 @@ version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7a3fefed9c8c11268d446d924baca8cabf52fe32f73fdaa20854bac6473590c" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "datafusion-common", "datafusion-expr-common", @@ -2437,7 +2493,7 @@ version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acd6ddc378f6ad19af95ccd6790dec8f8e1264bc4c70e99ddc1830c1a1c78ccd" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", @@ -2462,7 +2518,7 @@ version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06e6c05458eccd74b4c77ed6a1fe63d52434240711de7f6960034794dad1caf5" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "datafusion-common", "datafusion-expr-common", @@ -2492,7 +2548,7 @@ version = "44.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6608bc9844b4ddb5ed4e687d173e6c88700b1d0482f43894617d18a1fe75da" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", @@ -2800,6 +2856,25 @@ dependencies = [ "dtoa", ] +[[package]] +name = "duckdb" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86844939330ba6ce345c4b5333d3be45c4f0c092779bf9617bba92efb8b841f5" +dependencies = [ + "arrow", + "cast", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink 0.9.1", + "libduckdb-sys", + "memchr", + "num-integer", + "rust_decimal", + "smallvec", + "strum 0.25.0", +] + [[package]] name = "dyn-clone" version = "1.0.18" @@ -3011,6 +3086,18 @@ dependencies = [ "once_cell", ] +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fastdivide" version = "0.4.2" @@ -3792,6 +3879,9 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash 0.7.8", +] [[package]] name = "hashbrown" @@ -3799,7 +3889,7 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ - "ahash", + "ahash 0.8.11", "allocator-api2", ] @@ -3814,6 +3904,15 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashlink" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "hashlink" version = "0.10.0" @@ -5255,6 +5354,22 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +[[package]] +name = "libduckdb-sys" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac2de5219db852597558df5dcd617ffccd5cbd7b9f5402ccbf899aca6cb6047" +dependencies = [ + "autocfg", + "cc", + "flate2", + "pkg-config", + "serde", + "serde_json", + "tar", + "vcpkg", +] + [[package]] name = "libfuzzer-sys" version = "0.4.9" @@ -6185,7 +6300,7 @@ version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-cast", @@ -6686,6 +6801,15 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "proc-macro-crate" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro-hack" version = "0.5.20+deprecated" @@ -6778,6 +6902,26 @@ version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "publicsuffix" version = "2.3.0" @@ -7284,6 +7428,15 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "rend" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" +dependencies = [ + "bytecheck", +] + [[package]] name = "reqwest" version = "0.12.12" @@ -7388,6 +7541,35 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rkyv" +version = "0.7.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b" +dependencies = [ + "bitvec", + "bytecheck", + "bytes", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", + "tinyvec", + "uuid", +] + +[[package]] +name = "rkyv_derive" +version = "0.7.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "roaring" version = "0.10.9" @@ -7428,6 +7610,22 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "rust_decimal" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b082d80e3e3cc52b2ed634388d436fe1f4de6af5786cc2de9ba9737527bdf555" +dependencies = [ + "arrayvec", + "borsh", + "bytes", + "num-traits", + "rand 0.8.5", + "rkyv", + "serde", + "serde_json", +] + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -7655,6 +7853,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + [[package]] name = "secrecy" version = "0.10.3" @@ -7943,6 +8147,12 @@ dependencies = [ "quote", ] +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "similar" version = "2.7.0" @@ -8089,7 +8299,7 @@ version = "2.27.50" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6777700924fadc27a4393fb7f25f7ae7f4cb48ad0a1ae236eab11e009f2eada6" dependencies = [ - "ahash", + "ahash 0.8.11", "aho-corasick", "auto_encoder", "bytes", @@ -8112,7 +8322,7 @@ dependencies = [ "sqlx", "string-interner", "string_concat", - "strum", + "strum 0.26.3", "sysinfo", "tokio", "tokio-stream", @@ -8204,7 +8414,7 @@ dependencies = [ "futures-io", "futures-util", "hashbrown 0.15.2", - "hashlink", + "hashlink 0.10.0", "indexmap 2.7.1", "log", "memchr", @@ -8475,13 +8685,35 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros 0.25.3", +] + [[package]] name = "strum" version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ - "strum_macros", + "strum_macros 0.26.4", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.98", ] [[package]] @@ -8545,8 +8777,8 @@ dependencies = [ "pretty_assertions", "serde", "serde_json", - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "swiftide-core", "swiftide-macros", "temp-dir", @@ -8572,8 +8804,8 @@ dependencies = [ "qdrant-client", "serde", "serde_json", - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "tera", "test-case", "thiserror 2.0.11", @@ -8620,8 +8852,8 @@ dependencies = [ "num_cpus", "serde", "serde_json", - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "swiftide-core", "swiftide-macros", "test-case", @@ -8647,6 +8879,7 @@ dependencies = [ "chrono", "deadpool 0.12.2", "derive_builder", + "duckdb", "fastembed", "fluvio", "flv-util", @@ -8670,8 +8903,8 @@ dependencies = [ "serde_json", "spider", "sqlx", - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "swiftide-core", "swiftide-macros", "swiftide-test-utils", @@ -8690,6 +8923,7 @@ dependencies = [ "tree-sitter-rust", "tree-sitter-solidity", "tree-sitter-typescript", + "uuid", ] [[package]] @@ -9170,14 +9404,14 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6de8ec9ff8e9192f77add3b37ec040808a8bd6567c0616972b2a8568d7269c0d" dependencies = [ - "ahash", + "ahash 0.8.11", "auto_enums", "either", "itertools 0.13.0", "once_cell", "pulldown-cmark 0.12.2", "regex", - "strum", + "strum 0.26.3", "thiserror 1.0.69", "unicode-segmentation", ] diff --git a/swiftide-integrations/src/duckdb/persist.rs b/swiftide-integrations/src/duckdb/persist.rs index ab1bfc1c..9e2882f2 100644 --- a/swiftide-integrations/src/duckdb/persist.rs +++ b/swiftide-integrations/src/duckdb/persist.rs @@ -1,10 +1,10 @@ -use std::{collections::HashMap, path::Path}; +use std::{borrow::Cow, collections::HashMap, path::Path}; use anyhow::{Context as _, Result}; use async_trait::async_trait; use duckdb::{ params_from_iter, - types::{OrderedMap, ToSqlOutput, Value}, + types::{FromSql, OrderedMap, ToSqlOutput, Type, Value, ValueRef}, ToSql, }; use swiftide_core::{ @@ -19,12 +19,14 @@ use super::Duckdb; const SCHEMA: &str = include_str!("schema.sql"); const UPSERT: &str = include_str!("upsert.sql"); +#[allow(dead_code)] enum NodeValues<'a> { Uuid(Uuid), Path(&'a Path), Chunk(&'a str), Metadata(&'a Metadata), - Vector(&'a [f32]), + Vector(Cow<'a, [f32]>), + Null, } impl ToSql for NodeValues<'_> { @@ -33,22 +35,29 @@ impl ToSql for NodeValues<'_> { NodeValues::Uuid(uuid) => Ok(ToSqlOutput::Owned(uuid.to_string().into())), NodeValues::Path(path) => Ok(path.to_string_lossy().to_string().into()), // Should be borrow-able NodeValues::Chunk(chunk) => chunk.to_sql(), - NodeValues::Metadata(metadata) => { - let ordered_map: OrderedMap = metadata - .iter() - .map(|(k, v)| { - ( - k.to_string().into(), - serde_json::to_string(v).unwrap().into(), - ) - }) - .collect::>() - .into(); - Ok(ToSqlOutput::Owned(duckdb::types::Value::Map(ordered_map))) + NodeValues::Metadata(_metadata) => { + unimplemented!("maps are not yet implemented for duckdb"); + // let ordered_map = metadata + // .iter() + // .map(|(k, v)| format!("'{}': {}", k, serde_json::to_string(v).unwrap())) + // .collect::>() + // .join(","); + // + // let formatted = format!("MAP {{{ordered_map}}}"); + // Ok(ToSqlOutput::Owned(formatted.into())) } - NodeValues::Vector(vector) => Ok(ToSqlOutput::Owned(Value::Array( - vector.iter().map(|f| (*f).into()).collect(), - ))), + NodeValues::Vector(vector) => { + let array_str = format!( + "[{}]", + vector + .iter() + .map(ToString::to_string) + .collect::>() + .join(",") + ); + Ok(ToSqlOutput::Owned(array_str.into())) + } + NodeValues::Null => Ok(ToSqlOutput::Owned(Value::Null)), } } } @@ -85,8 +94,6 @@ impl Persist for Duckdb { // TODO: Doing potentially many locks here for the duration of a single query, // SOMEONE IS GOING TO HAVE A BAD TIME - let lock = self.connection.lock().await; - let mut stmt = lock.prepare(query)?; // metadata needs to be converted to `map_from_entries([('key1', value)])`` // TODO: Investigate if we can do with way less allocations @@ -94,9 +101,14 @@ impl Persist for Duckdb { NodeValues::Uuid(node.id()), NodeValues::Chunk(&node.chunk), NodeValues::Path(&node.path), - NodeValues::Metadata(&node.metadata), ]; + // if node.metadata.is_empty() { + // values.push(NodeValues::Null); + // } else { + // values.push(NodeValues::Metadata(&node.metadata)); + // } + let Some(node_vectors) = &node.vectors else { anyhow::bail!("Expected node to have vectors; cannot store into duckdb"); }; @@ -106,9 +118,11 @@ impl Persist for Duckdb { anyhow::bail!("Expected vector for field {} in node", field); }; - values.push(NodeValues::Vector(vector)); + values.push(NodeValues::Vector(vector.into())); } + let lock = self.connection.lock().await; + let mut stmt = lock.prepare(query)?; // TODO: Investigate concurrency in duckdb, maybe optmistic if it works stmt.execute(params_from_iter(values)) .context("Failed to store node")?; @@ -148,23 +162,27 @@ mod tests { client.setup().await.unwrap(); client.store(node).await.unwrap(); + tracing::info!("Stored node"); + let connection = client.connection.lock().await; - let mut stmt = connection.prepare("SELECT * FROM test").unwrap(); + let mut stmt = connection + .prepare("SELECT uuid,path,chunk FROM test") + .unwrap(); let node_iter = stmt .query_map([], |row| { Ok(( row.get::<_, String>(0).unwrap(), // id row.get::<_, String>(1).unwrap(), // chunk row.get::<_, String>(2).unwrap(), // path - row.get::<_, String>(3).unwrap(), // metadata - row.get::<_, String>(4).unwrap(), // vector + // row.get::<_, String>(3).unwrap(), // metadata + // row.get::<_, Vec>(4).unwrap(), // vector )) }) .unwrap(); let retrieved = node_iter.collect::, _>>().unwrap(); - + dbg!(&retrieved); + // assert_eq!(retrieved.len(), 1); - assert_eq!(retrieved[0].1, "Hello duckdb!"); } } diff --git a/swiftide-integrations/src/duckdb/retrieve.rs b/swiftide-integrations/src/duckdb/retrieve.rs index e69de29b..8ae417b0 100644 --- a/swiftide-integrations/src/duckdb/retrieve.rs +++ b/swiftide-integrations/src/duckdb/retrieve.rs @@ -0,0 +1,4 @@ +use duckdb::Statement; +use swiftide_core::{querying::search_strategies::CustomStrategy, Retrieve}; + +impl Retrieve diff --git a/swiftide-integrations/src/duckdb/schema.sql b/swiftide-integrations/src/duckdb/schema.sql index affd983c..fe3bfa18 100644 --- a/swiftide-integrations/src/duckdb/schema.sql +++ b/swiftide-integrations/src/duckdb/schema.sql @@ -5,8 +5,6 @@ CREATE TABLE IF NOT EXISTS {{table_name}} ( uuid VARCHAR PRIMARY KEY, chunk VARCHAR NOT NULL, path VARCHAR, - metadata MAP(VARCHAR, VARCHAR), - -- NOTE mind want to add created / updated timestamps {% for vector, size in vectors %} {{vector}} FLOAT[{{size}}], diff --git a/swiftide-integrations/src/duckdb/upsert.sql b/swiftide-integrations/src/duckdb/upsert.sql index 9546d60a..34283da2 100644 --- a/swiftide-integrations/src/duckdb/upsert.sql +++ b/swiftide-integrations/src/duckdb/upsert.sql @@ -1,5 +1,5 @@ -INSERT INTO {{ table_name }} (uuid, chunk, path, metadata, {{ vector_field_names | join(sep=", ") }}) -VALUES (?, ?, ?, ?, +INSERT INTO {{ table_name }} (uuid, chunk, path, {{ vector_field_names | join(sep=", ") }}) +VALUES (?, ?, ?, {% for _ in range(end=vector_field_names | length) %} ?, {% endfor %} @@ -7,7 +7,6 @@ VALUES (?, ?, ?, ?, ON CONFLICT (uuid) DO UPDATE SET chunk = EXCLUDED.chunk, path = EXCLUDED.path, - metadata = EXCLUDED.metadata, {% for vector in vector_field_names %} {{ vector }} = EXCLUDED.{{ vector }}, {% endfor %} From fce0d724bbbd6406530d0650cd79c31c04b32329 Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Wed, 12 Feb 2025 17:16:59 +0100 Subject: [PATCH 4/4] wip --- swiftide-integrations/src/duckdb/retrieve.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swiftide-integrations/src/duckdb/retrieve.rs b/swiftide-integrations/src/duckdb/retrieve.rs index 8ae417b0..63b97594 100644 --- a/swiftide-integrations/src/duckdb/retrieve.rs +++ b/swiftide-integrations/src/duckdb/retrieve.rs @@ -1,4 +1,4 @@ use duckdb::Statement; use swiftide_core::{querying::search_strategies::CustomStrategy, Retrieve}; -impl Retrieve +// impl Retrieve