Java gatherer (#1523)
* Push of gatherer after performance and memory leak testing

* Add Gatherer wrapper to Python learn submodule

* Changes from ./gradlew :Integrations:spotlessApply

* Minor update and rename to Pythonic conventions

* Fix typo

* Another typo

* Add _defineSymbols call so class wrappers get defined

* Changes.  Not fully ready, but I don't want to stash them yet.

* More updates.  Still not ready, but I want to save these and work on deephaven.java for a bit

* Updates

* Remove print statement for typeString

* Replace functions for each data type with one single function

* Add support for python built-in types, not just NumPy types

* Fix typo

* Convert Python built-in types to NumPy dtypes

* Updates from Chip's review.  Still testing

* Updates per Chip's review.  Code has been tested.  I will add example testing code in a comment on the PR

* spotlessApply

* Changes - remove transpose altogether, move transferrer to learn.gather

* spotlessApply so checks will pass

* Add unit tests for Python and Java.  The Python unit tests may fail because of the boolean issue (issue 1590)

* Update tests.  Java tests still fail, and Python will be updated.  But the code has been cleaned up

* Minor clean up of Python unit tests

* Updates to unit tests, make IndexSet public for use in Python unit tests

* spotless apply

* Fix Python test syntax

* Major changes to Gather functions, and updates to tests/Python code in accordance with

* Fix Python test

* Fix java tests

* Remove commented out java test

* Add row- and column-major functions.  Also update tests in Java/Python and the corresponding Python code

* spotlessApply

* Fix for Python

* Fix Python test (reference to old IndexSet)

* Changes from Chip's review

* spotlessApply

* Fix Python

* Updates from Chip's review

* spotlessApply

* Updates from Chip's review

* Fix return to fix Python test

* Chip's suggestion for comment in enum

* Put comments below enum values to make Sphinx happy
jjbrosnan authored Dec 9, 2021
1 parent 4e1524b commit ae32bfd
Showing 5 changed files with 1,134 additions and 1 deletion.
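For context, a rough usage sketch of the new deephaven.learn.gather module added by this commit. It assumes a running Deephaven worker with the JVM already initialized; the table and column names are illustrative, not taken from the commit.

import numpy as np
from deephaven import TableTools
from deephaven.learn import gather

# A small example table (column expressions follow the style used in the new tests)
table = TableTools.emptyTable(10).update("X = (int)i", "Y = (int)(2 * i)")

rows = table.getRowSet()
cols = [table.getColumnSource(col) for col in ["X", "Y"]]

# Gather the table into a 2-D NumPy array of 32-bit ints, in row-major order
data = gather.table_to_numpy_2d(rows, cols, gather.MemoryLayout.ROW_MAJOR, np.intc)
print(data.shape)  # (10, 2)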
6 changes: 5 additions & 1 deletion Integrations/python/deephaven/learn/__init__.py
@@ -30,13 +30,13 @@ def _defineSymbols():
        raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

    global _Input_, _Output_, _Computer_, _Scatterer_

    if _Input_ is None:
        _Input_ = jpy.get_type("io.deephaven.integrations.learn.Input")
        _Output_ = jpy.get_type("io.deephaven.integrations.learn.Output")
        _Computer_ = jpy.get_type("io.deephaven.integrations.learn.Computer")
        _Scatterer_ = jpy.get_type("io.deephaven.integrations.learn.Scatterer")


# every module method that invokes Java classes should be decorated with @_passThrough
@wrapt.decorator
def _passThrough(wrapped, instance, args, kwargs):
@@ -55,6 +55,10 @@ def _passThrough(wrapped, instance, args, kwargs):
    _defineSymbols()
    return wrapped(*args, **kwargs)

try:
    _defineSymbols()
except Exception as e:
    pass

@_passThrough
class Input:
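The hunk above makes the learn module attempt symbol definition eagerly at import time while the @_passThrough decorator still defines symbols lazily on first use. A minimal standalone sketch of the same pattern; the names here are hypothetical placeholders, not the real jpy lookups from the diff.

import wrapt

_MyType = None  # resolved lazily, once the runtime (here, the JVM) is available

def _define_symbols():
    # Hypothetical stand-in for the jpy.get_type(...) calls in the diff above
    global _MyType
    if _MyType is None:
        _MyType = object  # placeholder; the real code fetches a Java class via jpy

@wrapt.decorator
def _pass_through(wrapped, instance, args, kwargs):
    # Ensure the symbols exist before any decorated function or class is used
    _define_symbols()
    return wrapped(*args, **kwargs)

# Try to define symbols at import time, but tolerate the JVM not being up yet
try:
    _define_symbols()
except Exception:
    pass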
122 changes: 122 additions & 0 deletions Integrations/python/deephaven/learn/gather/__init__.py
@@ -0,0 +1,122 @@
#
# Copyright (c) 2016 - 2021 Deephaven Data Labs and Patent Pending
#
"""
Utilities for gathering Deephaven table data into Python objects
"""

import numpy as np
import enum
import jpy
import wrapt

# None until the first _defineSymbols() call
_gatherer = None

def _defineSymbols():
    if not jpy.has_jvm():
        raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

    global _gatherer
    global Layout

    if _gatherer is None:
        _gatherer = jpy.get_type("io.deephaven.integrations.learn.gather.NumPy")

class MemoryLayout(enum.Enum):
    """
    Memory layouts for an array.
    """
    ROW_MAJOR = True
    """Row-major memory layout."""
    COLUMN_MAJOR = False
    """Column-major memory layout."""
    C = True
    """Memory layout consistent with C arrays (row-major)."""
    FORTRAN = False
    """Memory layout consistent with Fortran arrays (column-major)."""

    def __init__(self, is_row_major):
        self.is_row_major = is_row_major


# Every method that depends on symbols defined via _defineSymbols() should be decorated with @_passThrough
@wrapt.decorator
def _passThrough(wrapped, instance, args, kwargs):
"""
For decoration of module methods, to define necessary symbols at runtime
:param wrapped: the method to be decorated
:param instance: the object to which the wrapped function was bound when it was called
:param args: the argument list for `wrapped`
:param kwargs: the keyword argument dictionary for `wrapped`
:return: the decorated version of the method
"""

_defineSymbols()
return wrapped(*args, **kwargs)

try:
_defineSymbols()
except Exception as e:
pass

@_passThrough
def convert_to_numpy_dtype(dtype):
"""
Convert an input type to the corresponding NumPy data type
:param dtype: A Python type
"""
if dtype.__module__ == np.__name__:
return dtype
elif dtype == bool:
dtype = np.bool_
elif dtype == float:
dtype = np.double
elif dtype == int:
dtype = np.intc
else:
raise ValueError(f"{dtype} is not a data type that can be converted to a NumPy dtype.")
return dtype

@_passThrough
def table_to_numpy_2d(row_set, col_set, order: MemoryLayout = MemoryLayout.ROW_MAJOR, dtype: np.dtype = np.intc):
    """
    Convert Deephaven table data to a 2d NumPy array of the appropriate size
    :param row_set: A RowSequence describing the number of rows in the table
    :param col_set: ColumnSources describing which columns to copy
    :param order: The desired memory layout of the output array
    :param dtype: The desired NumPy data type of the output NumPy array
    :return: A NumPy ndarray
    """

    if not isinstance(order, MemoryLayout):
        raise ValueError(f"Invalid major order {order}. Please use an enum value from MemoryLayout.")

    dtype = convert_to_numpy_dtype(dtype)

    if dtype == np.byte:
        buffer = _gatherer.tensorBuffer2DByte(row_set, col_set, order.is_row_major)
    elif dtype == np.short:
        buffer = _gatherer.tensorBuffer2DShort(row_set, col_set, order.is_row_major)
    elif dtype == np.intc:
        buffer = _gatherer.tensorBuffer2DInt(row_set, col_set, order.is_row_major)
    elif dtype == np.int_:
        buffer = _gatherer.tensorBuffer2DLong(row_set, col_set, order.is_row_major)
    elif dtype == np.single:
        buffer = _gatherer.tensorBuffer2DFloat(row_set, col_set, order.is_row_major)
    elif dtype == np.double:
        buffer = _gatherer.tensorBuffer2DDouble(row_set, col_set, order.is_row_major)
    else:
        raise ValueError(f"Data type {dtype} is not supported.")

    tensor = np.frombuffer(buffer, dtype = dtype)

    if order.is_row_major:
        tensor.shape = (len(col_set), row_set.intSize())
        return tensor.T
    else:
        tensor.shape = (row_set.intSize(), len(col_set))
        return tensor
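For reference, a NumPy-only sketch of the shape handling at the end of table_to_numpy_2d: both branches yield a (rows, columns) array, the row-major branch by reshaping to (columns, rows) and returning the transposed view. The buffer contents below are hypothetical; the real bytes come from the Java-side gatherer (io.deephaven.integrations.learn.gather.NumPy), which is not part of this Python diff.

import numpy as np

n_rows, n_cols = 4, 3
buffer = np.arange(n_rows * n_cols, dtype = np.intc).tobytes()  # stand-in for the Java buffer

tensor = np.frombuffer(buffer, dtype = np.intc)

# Column-major branch: view the flat buffer directly as (rows, columns)
col_major = tensor.reshape(n_rows, n_cols)

# Row-major branch: view as (columns, rows), then transpose to (rows, columns);
# the transpose is a view, so no data is copied
row_major = tensor.reshape(n_cols, n_rows).T

print(col_major.shape, row_major.shape)  # (4, 3) (4, 3)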
170 changes: 170 additions & 0 deletions Integrations/python/test/test_learn_gather.py
@@ -0,0 +1,170 @@
#
# Copyright (c) 2016-2021 Deephaven Data Labs and Patent Pending
#

##############################################################################
# NOTE: the jvm should have been initialized, or this test will certainly fail
##############################################################################

import pandas as pd
import numpy as np
import unittest
import jpy
import sys
import os

from deephaven import learn, tableToDataFrame, TableTools
from deephaven.learn import gather

class TestGather(unittest.TestCase):
    """
    Test cases for deephaven.learn submodule
    """

    @classmethod
    def setUpClass(cls):
        """
        Inherited method allowing initialization of test environment
        """
        # Tables
        cls.bool_table = TableTools.emptyTable(100).update(
            "X = true",
            "Y = false",
            "Z = (i % 2 == 0) ? true : false"
        )
        cls.byte_table = TableTools.emptyTable(100).update(
            "X = (byte)i",
            "Y = (byte)(100 - X)",
            "Z = (byte)(-101 + X)"
        )
        cls.short_table = TableTools.emptyTable(100).update(
            "X = (short)i",
            "Y = (short)(100 - X)",
            "Z = (short)(-101 + X)"
        )
        cls.int_table = TableTools.emptyTable(100).update(
            "X = (int)i",
            "Y = 100 - X",
            "Z = -101 + X"
        )
        cls.long_table = TableTools.emptyTable(100).update(
            "X = (long)i",
            "Y = 100 - X",
            "Z = -101 + X"
        )
        cls.float_table = TableTools.emptyTable(100).update(
            "X = (float)i",
            "Y = (float)sqrt(X)",
            "Z = (float)sqrt(Y)"
        )
        cls.double_table = TableTools.emptyTable(100).update(
            "X = (double)i",
            "Y = sqrt(X)",
            "Z = sqrt(Y)"
        )
        # NumPy arrays
        cls.bool_array = \
            np.array([[True, False, True], [True, False, False]] * 50,
                     dtype = np.bool_)
        cls.byte_array = np.vstack((
            np.arange(0, 100, dtype = np.byte),
            np.arange(100, 0, -1, dtype = np.byte),
            np.arange(-101, -1, dtype = np.byte)
        )).T
        cls.short_array = np.vstack((
            np.arange(0, 100, dtype = np.short),
            np.arange(100, 0, -1, dtype = np.short),
            np.arange(-101, -1, dtype = np.short)
        )).T
        cls.int_array = np.vstack((
            np.arange(0, 100, dtype = np.intc),
            np.arange(100, 0, -1, dtype = np.intc),
            np.arange(-101, -1, dtype = np.intc)
        )).T
        cls.long_array = np.vstack((
            np.arange(0, 100, dtype = np.int_),
            np.arange(100, 0, -1, dtype = np.int_),
            np.arange(-101, -1, dtype = np.int_)
        )).T
        cls.float_array = np.vstack((
            np.arange(0, 100, dtype = np.single),
            np.sqrt(np.arange(0, 100, dtype = np.single)),
            np.sqrt(np.sqrt(np.arange(0, 100, dtype = np.single)))
        )).T
        cls.double_array = np.vstack((
            np.arange(0, 100, dtype = np.double),
            np.sqrt(np.arange(0, 100, dtype = np.double)),
            np.sqrt(np.sqrt(np.arange(0, 100, dtype = np.double)))
        )).T

    # Model for learn to use when dtype = [np.bool_]
    def boolean_model(self, features):
        return np.count_nonzero(features, axis = 1) < 2

    # Model for learn to use when dtype = [np.byte, np.short, np.intc, np.int_]
    def integer_model(self, features):
        return np.sum(features, axis = 1)

    # Model for learn to use when dtype = [np.single, np.double]
    def decimal_model(self, features):
        return np.prod(features, axis = 1)

    # Test byte data types
    def test_byte(self):
        self.base_test(source = self.byte_table, model = self.integer_model, np_dtype = np.byte)

    # Test short data types
    def test_short(self):
        self.base_test(source = self.short_table, model = self.integer_model, np_dtype = np.short)

    # Test int data types
    def test_int(self):
        self.base_test(source = self.int_table, model = self.integer_model, np_dtype = np.intc)

    # Test long data types
    def test_long(self):
        self.base_test(source = self.long_table, model = self.integer_model, np_dtype = np.int_)

    # Test float data types
    def test_float(self):
        self.base_test(source = self.float_table, model = self.decimal_model, np_dtype = np.single)

    # Test double data types
    def test_double(self):
        self.base_test(source = self.double_table, model = self.decimal_model, np_dtype = np.double)

    # The base test, which other tests will be built from
    def base_test(self, source, model, np_dtype):

        rows = source.getRowSet()
        cols = [source.getColumnSource(col) for col in ["X", "Y", "Z"]]

        gatherer_rowmajor = lambda rowset, colset : gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.ROW_MAJOR, np_dtype)
        gatherer_colmajor = lambda rowset, colset : gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.COLUMN_MAJOR, np_dtype)

        array_from_table = tableToDataFrame(source).values

        gathered_rowmajor = gatherer_rowmajor(rows, cols)
        gathered_colmajor = gatherer_colmajor(rows, cols)

        with self.subTest(msg = "Array shape"):
            self.assertTrue(gathered_rowmajor.shape == array_from_table.shape)
            print("Row major gathered shape: {}".format(gathered_rowmajor.shape))
            self.assertTrue(gathered_colmajor.shape == array_from_table.shape)
            print("Column major gathered shape: {}".format(gathered_colmajor.shape))
        with self.subTest(msg = "Values in array"):
            self.assertTrue(np.allclose(gathered_rowmajor, array_from_table))
            print("All row-major array values are equal")
            self.assertTrue(np.allclose(gathered_colmajor, array_from_table))
            print("All column-major array values are equal")
        with self.subTest(msg = "Array data type"):
            self.assertTrue(gathered_rowmajor.dtype == np_dtype)
            self.assertTrue(gathered_rowmajor.dtype == array_from_table.dtype)
            self.assertTrue(gathered_colmajor.dtype == np_dtype)
            self.assertTrue(gathered_colmajor.dtype == array_from_table.dtype)
            self.assertTrue(gathered_rowmajor.dtype == gathered_colmajor.dtype)
            print("Array dtype: {}".format(np_dtype))
        with self.subTest(msg = "Contiguity"):
            self.assertTrue(gathered_rowmajor.flags["C_CONTIGUOUS"] or gathered_rowmajor.flags["F_CONTIGUOUS"])
            self.assertTrue(gathered_colmajor.flags["C_CONTIGUOUS"] or gathered_colmajor.flags["F_CONTIGUOUS"])
            print("Array contiguity checked")
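A short illustration of the "Contiguity" subtest above (an assumed aside, not part of the commit): the check passes for either layout because the transpose of a C-ordered 2-D array is reported as Fortran-contiguous.

import numpy as np

a = np.arange(6, dtype = np.intc).reshape(2, 3)  # C-contiguous
b = a.T                                          # same data, Fortran-contiguous view

print(a.flags["C_CONTIGUOUS"], a.flags["F_CONTIGUOUS"])  # True False
print(b.flags["C_CONTIGUOUS"], b.flags["F_CONTIGUOUS"])  # False True
print(b.flags["C_CONTIGUOUS"] or b.flags["F_CONTIGUOUS"])  # True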