Skip to content

Commit

Permalink
feat: Add ArrowArrayViewCompare() to check for array equality (#578)
Browse files Browse the repository at this point in the history
This PR is one possible component to address #577. While in some cases
we want a more relaxed comparison that allows (for example) arrays with
the same content to be considered equal even if they have different
content in null slots, in some cases we really do want an exact match.
This PR adds `ArrowArrayViewCompare()` in such a way that the same
signature could be used to apply the equality check at a more relaxed
validation level when this is implemented in a future PR, but only
implements the "identical" level since this is the easiest/most pressing
(applies to IPC validation).

The messages given by the implementation give the location of the
difference but not what the difference actually was. Knowing where the
error was is usually sufficient for a higher level runtime (e.g., R,
Python, C++) to give a fancier message if they want or need to.
  • Loading branch information
paleolimbot authored Aug 10, 2024
1 parent 162dcbd commit cfae94b
Show file tree
Hide file tree
Showing 4 changed files with 339 additions and 0 deletions.
131 changes: 131 additions & 0 deletions src/nanoarrow/common/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <errno.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

Expand Down Expand Up @@ -1335,3 +1336,133 @@ ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view,
ArrowErrorSet(error, "validation_level not recognized");
return EINVAL;
}

// State threaded through the recursive comparison helpers in this file.
struct ArrowComparisonInternalState {
// Comparison level requested by the caller of ArrowArrayViewCompare()
enum ArrowCompareLevel level;
// Initialized to 1 by ArrowArrayViewCompare(); set to 0 by the first check that fails
int is_equal;
// Destination for the "why not equal" message; this is the caller-supplied
// pointer and is handled as possibly NULL by ArrowComparePrependPath()
struct ArrowError* reason;
};

// Prepend a printf-style formatted path component to the message held in out.
//
// Does nothing if out is NULL or if formatting produces no characters (or
// fails). The result is always null-terminated inside out->message: when the
// prefix plus the existing message does not fit, the tail of the existing
// message is dropped rather than written past the end of the buffer.
NANOARROW_CHECK_PRINTF_ATTRIBUTE static void ArrowComparePrependPath(
    struct ArrowError* out, const char* fmt, ...) {
  if (out == NULL) {
    return;
  }

  char prefix[128];
  prefix[0] = '\0';
  va_list args;
  va_start(args, fmt);
  int formatted_len = vsnprintf(prefix, sizeof(prefix), fmt, args);
  va_end(args);

  if (formatted_len <= 0) {
    return;
  }

  // vsnprintf() reports the length the output would have had without
  // truncation, so clamp to the number of characters actually stored in prefix.
  size_t prefix_len = (size_t)formatted_len;
  if (prefix_len > sizeof(prefix) - 1) {
    prefix_len = sizeof(prefix) - 1;
  }

  // Usable characters in the destination, keeping one byte for the terminator
  size_t capacity = sizeof(out->message) - 1;
  if (prefix_len > capacity) {
    prefix_len = capacity;
  }

  // Keep only as much of the existing message as still fits after the prefix
  size_t out_len = strlen(out->message);
  size_t out_len_to_move = capacity - prefix_len;
  if (out_len_to_move > out_len) {
    out_len_to_move = out_len;
  }

  // The regions overlap, so the existing text must be shifted with memmove()
  memmove(out->message + prefix_len, out->message, out_len_to_move);
  memcpy(out->message, prefix, prefix_len);

  // Terminate after what was actually kept, not after the untruncated length
  out->message[prefix_len + out_len_to_move] = '\0';
}

// Record an inequality and leave the enclosing function.
//
// When cond_ is true: writes ": <reason_>" to (state_)->reason via
// ArrowErrorSet(), sets (state_)->is_equal to 0, and executes a bare
// `return;`, so this may only be expanded inside functions returning void.
// state_ must be a pointer to struct ArrowComparisonInternalState.
#define SET_NOT_EQUAL_AND_RETURN_IF_IMPL(cond_, state_, reason_) \
  do {                                                           \
    if (cond_) {                                                 \
      ArrowErrorSet((state_)->reason, ": %s", (reason_));        \
      (state_)->is_equal = 0;                                    \
      return;                                                    \
    }                                                            \
  } while (0)

// Same as SET_NOT_EQUAL_AND_RETURN_IF_IMPL() but uses the source text of
// condition_ as the reason. Because the expression is stringified, the
// identifiers used in condition_ become part of the reported message.
#define SET_NOT_EQUAL_AND_RETURN_IF(condition_, state_) \
  SET_NOT_EQUAL_AND_RETURN_IF_IMPL(condition_, state_, #condition_)

// Compare buffer i of actual and expected byte-for-byte.
//
// The buffers are considered different if their size_bytes differ or, for
// non-empty buffers, if memcmp() over size_bytes bytes reports a difference.
// On the first difference the SET_NOT_EQUAL_AND_RETURN_IF() macro writes the
// stringified condition to state->reason, sets state->is_equal to 0, and
// returns from this function. Because the condition text is the message, the
// expressions below (including the names i and buffer_size) must not be
// reworded without also updating callers/tests that check the message.
static void ArrowArrayViewCompareBuffer(const struct ArrowArrayView* actual,
const struct ArrowArrayView* expected, int i,
struct ArrowComparisonInternalState* state) {
SET_NOT_EQUAL_AND_RETURN_IF(
actual->buffer_views[i].size_bytes != expected->buffer_views[i].size_bytes, state);

// Sizes are known to be equal here, so either one can be used for the content check
int64_t buffer_size = actual->buffer_views[i].size_bytes;
// Skip memcmp() entirely for empty buffers
if (buffer_size > 0) {
SET_NOT_EQUAL_AND_RETURN_IF(
memcmp(actual->buffer_views[i].data.data, expected->buffer_views[i].data.data,
buffer_size) != 0,
state);
}
}

// Recursively check that actual and expected are identical.
//
// Checks, in order: storage_type, n_children, presence of a dictionary,
// length, offset, null_count, the content of each of the
// NANOARROW_MAX_FIXED_BUFFERS buffer views, every child, and finally the
// dictionary (if present). The first failing check sets state->is_equal to 0
// and writes a message to state->reason; no further checks are performed.
//
// When a difference is found in a buffer, child, or dictionary, the
// corresponding path component (".buffers[i]", ".children[i]", ".dictionary")
// is prepended to the message as the recursion unwinds, so the final message
// lists the path from the outermost array to the location of the difference.
static void ArrowArrayViewCompareIdentical(const struct ArrowArrayView* actual,
const struct ArrowArrayView* expected,
struct ArrowComparisonInternalState* state) {
// Structural checks: these must pass before children/dictionary are dereferenced below
SET_NOT_EQUAL_AND_RETURN_IF(actual->storage_type != expected->storage_type, state);
SET_NOT_EQUAL_AND_RETURN_IF(actual->n_children != expected->n_children, state);
SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary == NULL && expected->dictionary != NULL,
state);
SET_NOT_EQUAL_AND_RETURN_IF(actual->dictionary != NULL && expected->dictionary == NULL,
state);

SET_NOT_EQUAL_AND_RETURN_IF(actual->length != expected->length, state);
SET_NOT_EQUAL_AND_RETURN_IF(actual->offset != expected->offset, state);
SET_NOT_EQUAL_AND_RETURN_IF(actual->null_count != expected->null_count, state);

// Compare every fixed buffer slot; the helper reports a difference through state
for (int i = 0; i < NANOARROW_MAX_FIXED_BUFFERS; i++) {
ArrowArrayViewCompareBuffer(actual, expected, i, state);
if (!state->is_equal) {
ArrowComparePrependPath(state->reason, ".buffers[%d]", i);
return;
}
}

// n_children was checked to be equal above, so indexing expected->children is in step
for (int64_t i = 0; i < actual->n_children; i++) {
ArrowArrayViewCompareIdentical(actual->children[i], expected->children[i], state);
if (!state->is_equal) {
ArrowComparePrependPath(state->reason, ".children[%" PRId64 "]", i);
return;
}
}

// Dictionary presence was checked to match above, so expected->dictionary is non-NULL here too
if (actual->dictionary != NULL) {
ArrowArrayViewCompareIdentical(actual->dictionary, expected->dictionary, state);
if (!state->is_equal) {
ArrowComparePrependPath(state->reason, ".dictionary");
return;
}
}
}

// Top-level entry point to take care of creating, cleaning up, and
// propagating the ArrowComparisonInternalState to the caller
ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual,
const struct ArrowArrayView* expected,
enum ArrowCompareLevel level, int* out,
struct ArrowError* reason) {
struct ArrowComparisonInternalState state;
state.level = level;
state.is_equal = 1;
state.reason = reason;

switch (level) {
case NANOARROW_COMPARE_IDENTICAL:
ArrowArrayViewCompareIdentical(actual, expected, &state);
break;
default:
return EINVAL;
}

*out = state.is_equal;
if (!state.is_equal) {
ArrowComparePrependPath(state.reason, "root");
}

return NANOARROW_OK;
}

#undef SET_NOT_EQUAL_AND_RETURN_IF
#undef SET_NOT_EQUAL_AND_RETURN_IF_IMPL
183 changes: 183 additions & 0 deletions src/nanoarrow/common/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1816,6 +1816,189 @@ TEST(ArrayTest, ArrayViewTestBasic) {
ArrowArrayViewReset(&array_view);
}

// Exercises the structural checks performed by ArrowArrayViewCompare() at the
// NANOARROW_COMPARE_IDENTICAL level: storage type, number of children,
// presence of a dictionary, and recursion into children and dictionaries.
// Each failing case asserts on the exact message written to error, which is
// the path to the difference followed by the text of the failed condition.
TEST(ArrayTest, ArrayViewCompareTestStructure) {
struct ArrowError error;
struct ArrowArrayView actual;
struct ArrowArrayView expected;
// is_equal is reset to -1 before each comparison so that a value of 0 or 1
// proves the call wrote a result
int is_equal = -1;

// A view compared against itself is reported as equal
ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
ASSERT_EQ(ArrowArrayViewCompare(&actual, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 1);

// Check non-equal storage type
is_equal = -1;
ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_STRING);
ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root: actual->storage_type != expected->storage_type");

// Check non-equal numbers of children
is_equal = -1;
ArrowArrayViewReset(&actual);
ArrowArrayViewReset(&expected);
ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_STRUCT);
ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_STRUCT);
ASSERT_EQ(ArrowArrayViewAllocateChildren(&expected, 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root: actual->n_children != expected->n_children");

// Check difference in children
is_equal = -1;
ASSERT_EQ(ArrowArrayViewAllocateChildren(&actual, 1), NANOARROW_OK);
ArrowArrayViewInitFromType(actual.children[0], NANOARROW_TYPE_STRING);
ArrowArrayViewInitFromType(expected.children[0], NANOARROW_TYPE_BINARY);
ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message,
"root.children[0]: actual->storage_type != expected->storage_type");

// Check presence/absence of dictionary
is_equal = -1;
ArrowArrayViewReset(&actual);
ArrowArrayViewReset(&expected);
ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_INT32);
ASSERT_EQ(ArrowArrayViewAllocateDictionary(&expected), NANOARROW_OK);
ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message,
"root: actual->dictionary == NULL && expected->dictionary != NULL");

// Same pair with the arguments swapped, which trips the opposite
// dictionary presence check
is_equal = -1;
ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message,
"root: actual->dictionary != NULL && expected->dictionary == NULL");

// Check a difference in a dictionary
is_equal = -1;
ASSERT_EQ(ArrowArrayViewAllocateDictionary(&actual), NANOARROW_OK);
ArrowArrayViewInitFromType(actual.dictionary, NANOARROW_TYPE_STRING);
ArrowArrayViewInitFromType(expected.dictionary, NANOARROW_TYPE_BINARY);
ASSERT_EQ(ArrowArrayViewCompare(&actual, &expected, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message,
"root.dictionary: actual->storage_type != expected->storage_type");

ArrowArrayViewReset(&actual);
ArrowArrayViewReset(&expected);
}

// Exercises the value-level checks performed by ArrowArrayViewCompare() at the
// NANOARROW_COMPARE_IDENTICAL level: length, offset, null_count, buffer size,
// and buffer content, plus propagation of a difference found in a child or a
// dictionary.
//
// Note that every call in this test passes the local named expected as the
// function's actual argument and vice versa. The asserted messages are
// unaffected because they contain the function's parameter names and every
// check exercised here is a symmetric inequality.
TEST(ArrayTest, ArrayViewCompareTestIdentical) {
struct ArrowError error;
struct ArrowArrayView actual;
struct ArrowArrayView expected;
// is_equal is reset to -1 before each comparison so that a value of 0 or 1
// proves the call wrote a result
int is_equal = -1;

// Check non-equal length/offset/null count
ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_INT32);
expected.length = 1;
ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root: actual->length != expected->length");

// Restore the length so that offset is the only difference
is_equal = -1;
expected.length = actual.length;
expected.offset = 1;
ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root: actual->offset != expected->offset");

// Restore the offset so that null_count is the only difference
is_equal = -1;
expected.offset = actual.offset;
expected.null_count = 1;
ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root: actual->null_count != expected->null_count");

// Check non-equal buffer size
is_equal = -1;
expected.null_count = actual.null_count;
expected.buffer_views[1].size_bytes = 5;
ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message,
"root.buffers[1]: actual->buffer_views[i].size_bytes != "
"expected->buffer_views[i].size_bytes");

// Check equal buffer sizes with different buffer content; the buffer views
// are pointed at string literals of 5 characters each
is_equal = -1;
const char* actual_content = "abcde";
const char* expected_content = "bcdef";
actual.buffer_views[1].size_bytes = 5;
actual.buffer_views[1].data.as_char = actual_content;
expected.buffer_views[1].data.as_char = expected_content;

ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message,
"root.buffers[1]: memcmp(actual->buffer_views[i].data.data, "
"expected->buffer_views[i].data.data, buffer_size) != 0");

// Check difference in a child
is_equal = -1;
ArrowArrayViewReset(&actual);
ArrowArrayViewReset(&expected);
ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_STRUCT);
ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_STRUCT);
ASSERT_EQ(ArrowArrayViewAllocateChildren(&actual, 1), NANOARROW_OK);
ASSERT_EQ(ArrowArrayViewAllocateChildren(&expected, 1), NANOARROW_OK);
ArrowArrayViewInitFromType(actual.children[0], NANOARROW_TYPE_INT32);
ArrowArrayViewInitFromType(expected.children[0], NANOARROW_TYPE_INT32);
actual.children[0]->length = 1;

ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root.children[0]: actual->length != expected->length");

// Check difference in a dictionary
is_equal = -1;
ArrowArrayViewReset(&actual);
ArrowArrayViewReset(&expected);
ArrowArrayViewInitFromType(&actual, NANOARROW_TYPE_INT32);
ArrowArrayViewInitFromType(&expected, NANOARROW_TYPE_INT32);
ASSERT_EQ(ArrowArrayViewAllocateDictionary(&actual), NANOARROW_OK);
ASSERT_EQ(ArrowArrayViewAllocateDictionary(&expected), NANOARROW_OK);
actual.dictionary->length = 1;

ASSERT_EQ(ArrowArrayViewCompare(&expected, &actual, NANOARROW_COMPARE_IDENTICAL,
&is_equal, &error),
NANOARROW_OK);
EXPECT_EQ(is_equal, 0);
EXPECT_STREQ(error.message, "root.dictionary: actual->length != expected->length");

ArrowArrayViewReset(&actual);
ArrowArrayViewReset(&expected);
}

TEST(ArrayTest, ArrayViewTestComputeNullCount) {
struct ArrowError error;

Expand Down
11 changes: 11 additions & 0 deletions src/nanoarrow/common/inline_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,17 @@ enum ArrowValidationLevel {
NANOARROW_VALIDATION_LEVEL_FULL = 3
};

/// \brief Comparison level enumerator
/// \ingroup nanoarrow-utils
enum ArrowCompareLevel {
  /// \brief Strict comparison: arrays are equal only if their buffers hold
  /// identical content and their offset, null count, and length are
  /// identical. This is much stricter than logical equality, which would
  /// take into account potentially different content of null slots, arrays
  /// with a non-zero offset, and other considerations.
  NANOARROW_COMPARE_IDENTICAL = 0
};

/// \brief Get a string value of an enum ArrowTimeUnit value
/// \ingroup nanoarrow-utils
///
Expand Down
14 changes: 14 additions & 0 deletions src/nanoarrow/nanoarrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewSetArrayMinimal)
#define ArrowArrayViewValidate \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewValidate)
#define ArrowArrayViewCompare NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewCompare)
#define ArrowArrayViewReset NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewReset)
#define ArrowBasicArrayStreamInit \
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowBasicArrayStreamInit)
Expand Down Expand Up @@ -1064,6 +1065,19 @@ ArrowErrorCode ArrowArrayViewValidate(struct ArrowArrayView* array_view,
enum ArrowValidationLevel validation_level,
struct ArrowError* error);

/// \brief Compare two ArrowArrayView objects for equality
///
/// Given two ArrowArrayView instances, place either 0 (not equal) or
/// 1 (equal) at the address pointed to by out. If the comparison determines
/// that actual and expected are not equal, a description of where they differ
/// will be communicated via reason if reason is non-NULL.
///
/// Returns NANOARROW_OK if the comparison completed successfully (whether or
/// not the arrays were equal) or EINVAL if level is not a recognized
/// comparison level, in which case out is not written.
ArrowErrorCode ArrowArrayViewCompare(const struct ArrowArrayView* actual,
const struct ArrowArrayView* expected,
enum ArrowCompareLevel level, int* out,
struct ArrowError* reason);

/// \brief Reset the contents of an ArrowArrayView and free its resources
void ArrowArrayViewReset(struct ArrowArrayView* array_view);

Expand Down

0 comments on commit cfae94b

Please sign in to comment.