Skip to content

MLE-12345 - Refactors vector utility functions #113

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 33 additions & 35 deletions marklogic/vectors.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,43 @@
"""
Supports encoding and decoding vectors using the same approach as the vec:base64-encode and vec:base64-decode
functions supported by the MarkLogic server.
"""

import base64
import struct
from typing import List


def base64_encode(vector: List[float]) -> str:
    """
    Encodes a list of floats as a base64 string compatible with MarkLogic's vec:base64-encode.

    The packed buffer is little-endian: an int32 version (always 0), an int32
    dimension count, then one 32-bit float per element.

    :param vector: the values to encode; an empty list encodes just the header.
    :return: the base64 text of the packed buffer, as an ASCII string.
    """
    dimensions = len(vector)
    # "<" selects little-endian with no alignment padding, so one format string
    # lays out the header and the floats back to back.
    buffer = struct.pack(f"<ii{dimensions}f", 0, dimensions, *vector)
    return base64.b64encode(buffer).decode("ascii")


def base64_decode(encoded_vector: str) -> List[float]:
    """
    Decodes a base64 string to a list of floats compatible with MarkLogic's vec:base64-decode.

    :param encoded_vector: base64 text holding an int32 version, an int32
        dimension count and that many little-endian 32-bit floats.
    :return: the decoded floats, in order.
    :raises ValueError: if the buffer is shorter than the 8-byte header, the
        version is not 0, the dimension count is negative, or the buffer holds
        fewer floats than the header announces.
    """
    buffer = base64.b64decode(encoded_vector)
    if len(buffer) < 8:
        raise ValueError(
            "Buffer is too short to contain version and dimensions."
        )
    version, dimensions = struct.unpack("<ii", buffer[:8])
    if version != 0:
        raise ValueError(f"Unsupported vector version: {version}")
    # The count is read as a signed int32; a negative value would otherwise
    # slip past the length check below and decode to an empty list.
    if dimensions < 0:
        raise ValueError(f"Invalid vector dimensions: {dimensions}")
    expected_length = 8 + 4 * dimensions
    if len(buffer) < expected_length:
        raise ValueError(
            f"Buffer is too short for the specified dimensions: expected {expected_length}, got {len(buffer)}"
        )
    # Bytes past expected_length are ignored.
    floats = struct.unpack(f"<{dimensions}f", buffer[8:expected_length])
    return list(floats)


class VectorUtil:
    """
    Supports encoding and decoding vectors using the same approach as the vec:base64-encode and vec:base64-decode
    functions supported by the MarkLogic server.

    Both static methods delegate to the module-level functions of the same
    name, so either calling style gives the same result.
    """

    @staticmethod
    def base64_encode(vector: List[float]) -> str:
        """
        Encodes a list of floats as a base64 string compatible with MarkLogic's vec:base64-encode.
        """
        return base64_encode(vector)

    @staticmethod
    def base64_decode(encoded_vector: str) -> List[float]:
        """
        Decodes a base64 string to a list of floats compatible with MarkLogic's vec:base64-decode.
        """
        return base64_decode(encoded_vector)
12 changes: 6 additions & 6 deletions tests/test_vectors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import math
import ast
from marklogic.vectors import VectorUtil
from marklogic.vectors import base64_encode, base64_decode
from marklogic import Client

VECTOR = [3.14, 1.59, 2.65]
Expand All @@ -9,17 +9,17 @@


def test_encode_and_decode_with_python():
    """
    Encode VECTOR in Python, check it against EXPECTED_BASE64, then decode it
    and check every element is within ACCEPTABLE_DELTA of the input.
    """
    encoded = base64_encode(VECTOR)
    assert encoded == EXPECTED_BASE64

    decoded = base64_decode(encoded)
    assert len(decoded) == len(VECTOR)
    for a, b in zip(decoded, VECTOR):
        assert abs(a - b) < ACCEPTABLE_DELTA


def test_decode_known_base64():
    """
    Decode EXPECTED_BASE64 and check every element is within ACCEPTABLE_DELTA
    of the matching element of VECTOR.
    """
    decoded = base64_decode(EXPECTED_BASE64)
    assert len(decoded) == len(VECTOR)
    for a, b in zip(decoded, VECTOR):
        assert abs(a - b) < ACCEPTABLE_DELTA
Expand All @@ -29,7 +29,7 @@ def test_encode_and_decode_with_server(client: Client):
"""
Encode a vector in Python, decode it on the MarkLogic server, and check the result.
"""
encoded = VectorUtil.base64_encode(VECTOR)
encoded = base64_encode(VECTOR)
assert encoded == EXPECTED_BASE64

# Use MarkLogic's eval endpoint to decode the vector on the server
Expand All @@ -49,7 +49,7 @@ def test_encode_with_server_and_decode_with_python(client: Client):
encoded = client.eval(xquery=xquery)[0]
assert encoded == EXPECTED_BASE64

decoded = VectorUtil.base64_decode(encoded)
decoded = base64_decode(encoded)
assert len(decoded) == len(VECTOR)
for a, b in zip(decoded, VECTOR):
assert math.isclose(a, b, abs_tol=ACCEPTABLE_DELTA)