Skip to content

Commit

Permalink
add hybrid search
Browse files Browse the repository at this point in the history
  • Loading branch information
yujonglee committed Jun 22, 2024
1 parent 2a65408 commit b95a113
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 0 deletions.
110 changes: 110 additions & 0 deletions core/lib/canary/sources/document.ex
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,120 @@ defmodule Canary.Sources.Document do
# Default create action: only the document text and its embedding are writable.
create :create, accept: [:content, :embedding]

# Read action whose rows are produced by Canary.Sources.Document.HybridSearch
# rather than by the data layer's generated query.
read :hybrid_search do
  # Free-text query and its embedding; both are required.
  argument :text, :string, allow_nil?: false
  argument :embedding, :vector, allow_nil?: false

  # Optional score cut-off; the manual read module picks a default when nil.
  argument :threshold, :float, allow_nil?: true

  manual Canary.Sources.Document.HybridSearch
end
end

# Data layer settings: which repo to use and which table backs this resource.
postgres do
  repo Canary.Repo
  table "source_documents"
end
end

defmodule Canary.Sources.Document.HybridSearch do
  @moduledoc """
  Manual read for the `:hybrid_search` action of `Canary.Sources.Document`.

  Runs a raw SQL query that joins the documents table against the
  `rank_hybrid` function of the BM25 index and loads the resulting rows
  back into `Canary.Sources.Document` structs.
  """

  use Ash.Resource.ManualRead

  @index_name "search_index"
  @table_name "source_documents"
  @table_id_field "id"
  @table_text_field "content"
  @table_vector_field "embedding"

  @doc """
  Ash manual-read callback.

  Reads the `:text`, `:embedding` and optional `:threshold` arguments plus the
  query limit from `ash_query` and delegates to `hybrid_search/3`.

  Returns `{:ok, records}` on success, or the `{:error, reason}` tuple produced
  by the underlying SQL query on failure.
  """
  @impl true
  def read(ash_query, _ecto_query, _opts, _context) do
    text = ash_query.arguments.text
    embedding = ash_query.arguments.embedding

    opts = [
      threshold: ash_query.arguments[:threshold],
      limit: ash_query.limit
    ]

    # hybrid_search/3 returns either a list of records or an error tuple;
    # only the list may be wrapped in {:ok, _}, otherwise callers would
    # receive {:ok, {:error, reason}}.
    case hybrid_search(text, embedding, opts) do
      records when is_list(records) -> {:ok, records}
      {:error, _reason} = error -> error
    end
  end

  @doc """
  Runs the hybrid (BM25 + vector similarity) search.

  ## Options

    * `:limit` - maximum number of rows returned (defaults to `10`)
    * `:threshold` - minimum `rank_hybrid` score a row must reach
      (defaults to `0.4`)

  Returns a list of loaded `Canary.Sources.Document` records, or the error
  returned by `Canary.Repo.query/2` unchanged.
  """
  def hybrid_search(text, embedding, opts \\ []) do
    n = opts[:limit] || 10
    threshold = opts[:threshold] || 0.4

    embedding = Ash.Vector.to_list(embedding)

    # NOTE(review): `text` is embedded verbatim in the BM25 query string. It is
    # sent as a bound parameter, so it cannot alter the SQL statement itself,
    # but characters meaningful to the index's query parser are not escaped
    # here - confirm whether callers sanitize it.
    bm25_query = "#{@table_text_field}:#{text}"
    similarity_query = "'#{Jason.encode!(embedding)}' <-> #{@table_vector_field}"

    """
    SELECT doc.*
    FROM #{@table_name} doc
    LEFT JOIN (
      SELECT *
      FROM #{@index_name}.rank_hybrid(
        bm25_query => $1,
        similarity_query => $2,
        bm25_weight => 0.6,
        bm25_limit_n => 100,
        similarity_weight => 0.4,
        similarity_limit_n => 100
      )
    ) index
    ON doc.#{@table_id_field} = index.id
    WHERE index.rank_hybrid >= $3
    ORDER BY index.rank_hybrid DESC
    LIMIT $4;
    """
    |> query([bm25_query, similarity_query, threshold, n])
  end

  # Executes the raw SQL and maps every returned row onto the Document schema.
  # Anything other than a successful result is handed back untouched.
  defp query(query, params) do
    case Canary.Repo.query(query, params) do
      {:ok, %{rows: rows, columns: columns}} ->
        Enum.map(rows, &Canary.Repo.load(Canary.Sources.Document, {columns, &1}))

      error ->
        error
    end
  end
end

defmodule Canary.Sources.Document.Migration do
  @moduledoc """
  Creates and drops the ParadeDB BM25 index over the documents table.

  The statements live here, next to the search code that uses the same index
  name, and are invoked from a migration under `priv/repo/migrations`.
  """

  use Ecto.Migration

  @index_name "search_index"
  @table_name "source_documents"
  @table_id_field "id"

  @doc """
  Creates the BM25 index named `#{@index_name}` on `#{@table_name}`, indexing
  the `content` column with an n-gram tokenizer.
  """
  def up do
    text_fields =
      Jason.encode!(%{
        content: %{
          tokenizer: %{type: "ngram", min_gram: 4, max_gram: 6, prefix_only: true}
        }
      })

    execute("""
    CALL paradedb.create_bm25(
      index_name => '#{@index_name}',
      table_name => '#{@table_name}',
      key_field => '#{@table_id_field}',
      text_fields => '#{text_fields}'
    );
    """)
  end

  @doc """
  Drops the BM25 index created by `up/0`.
  """
  def down do
    # drop_bm25 is keyed by the index name given to create_bm25, not by the
    # name of the table the index was built on.
    execute("""
    CALL paradedb.drop_bm25('#{@index_name}');
    """)
  end
end
11 changes: 11 additions & 0 deletions core/priv/repo/migrations/20240622031331_add_bm25_index.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
defmodule Canary.Repo.Migrations.AddBm25Index do
  @moduledoc """
  Adds the BM25 search index by delegating to
  `Canary.Sources.Document.Migration`.
  """

  use Ecto.Migration

  alias Canary.Sources.Document.Migration, as: DocumentMigration

  # Both directions simply forward to the module that owns the index SQL.
  def up, do: DocumentMigration.up()

  def down, do: DocumentMigration.down()
end

0 comments on commit b95a113

Please sign in to comment.