From ac10fdb2cdc80c2706f59d7d62541fe6c98316bc Mon Sep 17 00:00:00 2001
From: "Michael J. Sullivan"
Date: Wed, 7 Jun 2023 18:02:09 -0700
Subject: [PATCH 1/4] Implement support for vector type

Vectors get decoded into array.array. Encoding supports any list-like
array of numbers, but has an optimized fast path for things like array
and ndarray that avoids needing to box integers.

The test passes when run against the current `pgvector` branch with
edgedb/edgedb#5620 applied to it.
---
 edgedb/protocol/codecs/codecs.pyx | 107 ++++++++++++++++++++++-
 tests/test_vector.py              | 141 ++++++++++++++++++++++++++++++
 2 files changed, 245 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_vector.py

diff --git a/edgedb/protocol/codecs/codecs.pyx b/edgedb/protocol/codecs/codecs.pyx
index d0cfa7b5..3778c91e 100644
--- a/edgedb/protocol/codecs/codecs.pyx
+++ b/edgedb/protocol/codecs/codecs.pyx
@@ -17,6 +17,7 @@
 #
 
 
+import array
 import decimal
 import uuid
 import datetime
@@ -24,6 +25,8 @@
 from edgedb import describe
 from edgedb import enums
 from edgedb.datatypes import datatypes
 
+from libc.string cimport memcpy
+
 
 include "./edb_types.pxi"
@@ -347,14 +350,16 @@ cdef dict BASE_SCALAR_CODECS = {}
 
 cdef register_base_scalar_codec(
         str name,
         pgproto.encode_func encoder,
-        pgproto.decode_func decoder):
+        pgproto.decode_func decoder,
+        object tid = None):
     cdef:
         BaseCodec codec
 
-    tid = TYPE_IDS.get(name)
     if tid is None:
-        raise RuntimeError(f'cannot find known ID for type {name!r}')
+        tid = TYPE_IDS.get(name)
+        if tid is None:
+            raise RuntimeError(f'cannot find known ID for type {name!r}')
     tid = tid.bytes
 
     if tid in BASE_SCALAR_CODECS:
@@ -510,6 +515,94 @@ cdef config_memory_decode(pgproto.CodecContext settings, FRBuffer *buf):
     return datatypes.ConfigMemory(bytes=bytes)
 
 
+DEF PGVECTOR_MAX_DIM = 16000
+
+
+cdef pgvector_encode_memview(pgproto.CodecContext settings, WriteBuffer buf,
+                             float[:] obj):
+    cdef:
+        float item
+        Py_ssize_t objlen
+        Py_ssize_t i
+
+    objlen = len(obj)
+    if objlen > PGVECTOR_MAX_DIM:
+        raise ValueError('too many dimensions in vector value')
+
+    buf.write_int32(4 + objlen*4)
+    buf.write_int16(objlen)
+    buf.write_int16(0)
+    for i in range(objlen):
+        buf.write_float(obj[i])
+
+
+cdef pgvector_encode(pgproto.CodecContext settings, WriteBuffer buf,
+                     object obj):
+    cdef:
+        float item
+        Py_ssize_t objlen
+        float[:] memview
+        Py_ssize_t i
+
+    # If we can take a typed memview of the object, we use that.
+    # That is good, because it means we can consume array.array and
+    # numpy.ndarray without needing to unbox.
+    # Otherwise we take the slow path, indexing into the array using
+    # the normal protocol.
+    try:
+        memview = obj
+    except (ValueError, TypeError) as e:
+        pass
+    else:
+        pgvector_encode_memview(settings, buf, memview)
+        return
+
+    if not _is_array_iterable(obj):
+        raise TypeError(
+            'a sized iterable container expected (got type {!r})'.format(
+                type(obj).__name__))
+
+    # Annoyingly, this is literally identical code to the fast path...
+    # but the types are different in critical ways.
+    objlen = len(obj)
+    if objlen > PGVECTOR_MAX_DIM:
+        raise ValueError('too many dimensions in vector value')
+
+    buf.write_int32(4 + objlen*4)
+    buf.write_int16(objlen)
+    buf.write_int16(0)
+    for i in range(objlen):
+        buf.write_float(obj[i])
+
+
+cdef object ONE_EL_ARRAY = array.array('f', [0.0])
+
+
+cdef pgvector_decode(pgproto.CodecContext settings, FRBuffer *buf):
+    cdef:
+        int32_t dim
+        Py_ssize_t size
+        Py_buffer view
+        char *p
+        float[:] array_view
+
+    dim = hton.unpack_uint16(frb_read(buf, 2))
+    frb_read(buf, 2)
+
+    size = dim * 4
+    p = frb_read(buf, size)
+
+    # Create a float array with size dim
+    val = ONE_EL_ARRAY * dim
+
+    # And fill it with the buffer contents
+    array_view = val
+    memcpy(&array_view[0], p, size)
+    val.byteswap()
+
+    return val
+
+
 cdef checked_decimal_encode(
     pgproto.CodecContext settings, WriteBuffer buf, obj
 ):
@@ -712,4 +805,12 @@ cdef register_base_scalar_codecs():
         pgproto.text_encode,
         pgproto.text_decode)
 
+    register_base_scalar_codec(
+        'vector::vector',
+        pgvector_encode,
+        pgvector_decode,
+        uuid.UUID('9565dd88-04f5-11ee-a691-0b6ebe179825'),
+    )
+
+
 register_base_scalar_codecs()
diff --git a/tests/test_vector.py b/tests/test_vector.py
new file mode 100644
index 00000000..a96119d3
--- /dev/null
+++ b/tests/test_vector.py
@@ -0,0 +1,141 @@
+#
+# This source file is part of the EdgeDB open source project.
+#
+# Copyright 2019-present MagicStack Inc. and the EdgeDB authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from edgedb import _testbase as tb
+import edgedb
+
+import array
+
+
+# An array.array subtype where indexing doesn't work.
+# We use this to verify that the non-boxing memoryview based
+# fast path works, since the slow path won't work on this object.
+class brokenarray(array.array):
+    def __getitem__(self, i):
+        raise AssertionError("the fast path wasn't used!")
+
+
+class TestVector(tb.SyncQueryTestCase):
+    def setUp(self):
+        super().setUp()
+
+        if not self.client.query_required_single('''
+            select exists (
+                select sys::ExtensionPackage filter .name = 'vector'
+            )
+        '''):
+            self.skipTest("feature not implemented")
+
+        self.client.execute('''
+            create extension vector version '1.0'
+        ''')
+
+    def tearDown(self):
+        try:
+            self.client.execute('''
+                drop extension vector version '1.0'
+            ''')
+        finally:
+            super().tearDown()
+
+    async def test_vector_01(self):
+        # if not self.client.query_required_single('''
+        #     select exists (
+        #         select sys::ExtensionPackage filter .name = 'vector'
+        #     )
+        # '''):
+        #     self.skipTest("feature not implemented")
+
+        # self.client.execute('''
+        #     create extension vector version '1.0'
+        # ''')
+
+        val = self.client.query_single('''
+            select <vector::vector>'[1.5,2.0,3.8]'
+        ''')
+        self.assertTrue(isinstance(val, array.array))
+        self.assertEqual(val, array.array('f', [1.5, 2.0, 3.8]))
+
+        val = self.client.query_single(
+            '''
+            select <str><vector::vector>$0
+            ''',
+            [3.0, 9.0, -42.5],
+        )
+        self.assertEqual(val, '[3,9,-42.5]')
+
+        val = self.client.query_single(
+            '''
+            select <str><vector::vector>$0
+            ''',
+            array.array('f', [3.0, 9.0, -42.5])
+        )
+        self.assertEqual(val, '[3,9,-42.5]')
+
+        val = self.client.query_single(
+            '''
+            select <str><vector::vector>$0
+            ''',
+            array.array('i', [1, 2, 3]),
+        )
+        self.assertEqual(val, '[1,2,3]')
+
+        # Test that the fast-path works: if the encoder tries to
+        # call __getitem__ on this brokenarray, it will fail.
+        val = self.client.query_single(
+            '''
+            select <str><vector::vector>$0
+            ''',
+            brokenarray('f', [3.0, 9.0, -42.5])
+        )
+        self.assertEqual(val, '[3,9,-42.5]')
+
+        # I don't think it's worth adding a dependency to test this,
+        # but this works too:
+        # import numpy as np
+        # val = self.client.query_single(
+        #     '''
+        #     select <str><vector::vector>$0
+        #     ''',
+        #     np.asarray([3.0, 9.0, -42.5], dtype=np.float32),
+        # )
+
+        # Some sad path tests
+        with self.assertRaises(edgedb.InvalidArgumentError):
+            self.client.query_single(
+                '''
+                select <vector::vector>$0
+                ''',
+                [3.0, None, -42.5],
+            )
+
+        with self.assertRaises(edgedb.InvalidArgumentError):
+            self.client.query_single(
+                '''
+                select <vector::vector>$0
+                ''',
+                [3.0, 'x', -42.5],
+            )
+
+        with self.assertRaises(edgedb.InvalidArgumentError):
+            self.client.query_single(
+                '''
+                select <vector::vector>$0
+                ''',
+                'foo',
+            )

From ef8ad0ce4b1676f2cc128f7a4545fe37d301dac1 Mon Sep 17 00:00:00 2001
From: "Michael J. Sullivan"
Date: Thu, 8 Jun 2023 12:38:17 -0700
Subject: [PATCH 2/4] only restrict it to uint16 max

---
 edgedb/protocol/codecs/codecs.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/edgedb/protocol/codecs/codecs.pyx b/edgedb/protocol/codecs/codecs.pyx
index 3778c91e..2a3bc72e 100644
--- a/edgedb/protocol/codecs/codecs.pyx
+++ b/edgedb/protocol/codecs/codecs.pyx
@@ -515,7 +515,7 @@ cdef config_memory_decode(pgproto.CodecContext settings, FRBuffer *buf):
     return datatypes.ConfigMemory(bytes=bytes)
 
 
-DEF PGVECTOR_MAX_DIM = 16000
+DEF PGVECTOR_MAX_DIM = (1 << 16) - 1
 
 
 cdef pgvector_encode_memview(pgproto.CodecContext settings, WriteBuffer buf,

From 631ad4c43e39701c8ffa7b0f061d7958a3496167 Mon Sep 17 00:00:00 2001
From: "Michael J. Sullivan"
Date: Thu, 8 Jun 2023 13:10:30 -0700
Subject: [PATCH 3/4] don't say dimensions

---
 edgedb/protocol/codecs/codecs.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/edgedb/protocol/codecs/codecs.pyx b/edgedb/protocol/codecs/codecs.pyx
index 2a3bc72e..6d10d30f 100644
--- a/edgedb/protocol/codecs/codecs.pyx
+++ b/edgedb/protocol/codecs/codecs.pyx
@@ -527,7 +527,7 @@ cdef pgvector_encode_memview(pgproto.CodecContext settings, WriteBuffer buf,
 
     objlen = len(obj)
     if objlen > PGVECTOR_MAX_DIM:
-        raise ValueError('too many dimensions in vector value')
+        raise ValueError('too many elements in vector value')
 
     buf.write_int32(4 + objlen*4)
     buf.write_int16(objlen)
@@ -566,7 +566,7 @@ cdef pgvector_encode(pgproto.CodecContext settings, WriteBuffer buf,
     # but the types are different in critical ways.
     objlen = len(obj)
     if objlen > PGVECTOR_MAX_DIM:
-        raise ValueError('too many dimensions in vector value')
+        raise ValueError('too many elements in vector value')
 
     buf.write_int32(4 + objlen*4)
     buf.write_int16(objlen)

From 7d697b87345e4fe9cac041df08a8a16351107e10 Mon Sep 17 00:00:00 2001
From: "Michael J. Sullivan"
Date: Thu, 8 Jun 2023 16:11:30 -0700
Subject: [PATCH 4/4] rename the type

---
 edgedb/protocol/codecs/codecs.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/edgedb/protocol/codecs/codecs.pyx b/edgedb/protocol/codecs/codecs.pyx
index 6d10d30f..da40e283 100644
--- a/edgedb/protocol/codecs/codecs.pyx
+++ b/edgedb/protocol/codecs/codecs.pyx
@@ -806,7 +806,7 @@ cdef register_base_scalar_codecs():
         pgproto.text_decode)
 
     register_base_scalar_codec(
-        'vector::vector',
+        'ext::pgvector::vector',
         pgvector_encode,
         pgvector_decode,
         uuid.UUID('9565dd88-04f5-11ee-a691-0b6ebe179825'),