Java gatherer (#1523)
* Push of gatherer after performance and memory leak testing

* Add Gatherer wrapper to Python learn submodule

* Changes from ./gradlew :Integrations:spotlessApply

* Minor update and rename to Pythonic conventions

* Fix typo

* Another typo

* Add _defineSymbols call so class wrappers get defined

* Changes.  Not fully ready, but I don't want to stash them yet.

* More updates.  Still not ready, but I want to save these and work on deephaven.java for a bit

* Updates

* Remove print statement for typeString

* Replace functions for each data type with one single function

* Add support for python built-in types, not just NumPy types

* Fix typo

* Convert Python built-in types to NumPy dtypes

* Updates from Chip's review.  Still testing

* Updates per Chip's review.  Code has been tested.  I will add example testing code in a comment on the PR

* spotlessApply

* Changes - remove transpose altogether, move transferrer to learn.gather

* spotlessApply so checks will pass

* Add unit tests for Python and Java.  The Python unit tests may fail because of the boolean issue (issue 1590)

* Update tests.  Java tests still fail, and Python will be updated.  But the code has been cleaned up

* Minor clean up of Python unit tests

* Updates to unit tests, make IndexSet public for use in Python unit tests

* spotless apply

* Fix Python test syntax

* Major changes to Gather functions, and updates to tests/Python code in accordance with

* Fix Python test

* Fix java tests

* Remove commented out java test

* Add row- and column-major functions.  Also update tests in Java/Python and the corresponding Python code

* spotlessApply

* Fix for Python

* Fix Python test (reference to old IndexSet)

* Changes from Chip's review

* spotlessApply

* Fix Python

* Updates from Chip's review

* spotlessApply

* Updates from Chip's review

* Fix return to fix Python test

* Chip's suggestion for comment in enum

* Put comments below enum values to make Sphinx happy
jjbrosnan authored Dec 9, 2021
1 parent 4e1524b commit ae32bfd
Showing 5 changed files with 1,134 additions and 1 deletion.
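For context, a rough usage sketch of the new deephaven.learn.gather module added by this commit. It assumes a running Deephaven worker with the JVM already initialized; the table and column names are illustrative, not taken from the commit.

import numpy as np
from deephaven import TableTools
from deephaven.learn import gather

# A small example table (column expressions follow the style used in the new tests)
table = TableTools.emptyTable(10).update("X = (int)i", "Y = (int)(2 * i)")

rows = table.getRowSet()
cols = [table.getColumnSource(col) for col in ["X", "Y"]]

# Gather the table into a 2-D NumPy array of 32-bit ints, in row-major order
data = gather.table_to_numpy_2d(rows, cols, gather.MemoryLayout.ROW_MAJOR, np.intc)
print(data.shape)  # (10, 2)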
6 changes: 5 additions & 1 deletion Integrations/python/deephaven/learn/__init__.py
@@ -30,13 +30,13 @@ def _defineSymbols():
        raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

    global _Input_, _Output_, _Computer_, _Scatterer_

    if _Input_ is None:
        _Input_ = jpy.get_type("io.deephaven.integrations.learn.Input")
        _Output_ = jpy.get_type("io.deephaven.integrations.learn.Output")
        _Computer_ = jpy.get_type("io.deephaven.integrations.learn.Computer")
        _Scatterer_ = jpy.get_type("io.deephaven.integrations.learn.Scatterer")


# every module method that invokes Java classes should be decorated with @_passThrough
@wrapt.decorator
def _passThrough(wrapped, instance, args, kwargs):
@@ -55,6 +55,10 @@ def _passThrough(wrapped, instance, args, kwargs):
    _defineSymbols()
    return wrapped(*args, **kwargs)

try:
    _defineSymbols()
except Exception as e:
    pass

@_passThrough
class Input:
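The hunk above makes the learn module attempt symbol definition eagerly at import time while the @_passThrough decorator still defines symbols lazily on first use. A minimal standalone sketch of the same pattern; the names here are hypothetical placeholders, not the real jpy lookups from the diff.

import wrapt

_MyType = None  # resolved lazily, once the runtime (here, the JVM) is available

def _define_symbols():
    # Hypothetical stand-in for the jpy.get_type(...) calls in the diff above
    global _MyType
    if _MyType is None:
        _MyType = object  # placeholder; the real code fetches a Java class via jpy

@wrapt.decorator
def _pass_through(wrapped, instance, args, kwargs):
    # Ensure the symbols exist before any decorated function or class is used
    _define_symbols()
    return wrapped(*args, **kwargs)

# Try to define symbols at import time, but tolerate the JVM not being up yet
try:
    _define_symbols()
except Exception:
    pass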
122 changes: 122 additions & 0 deletions Integrations/python/deephaven/learn/gather/__init__.py
@@ -0,0 +1,122 @@
#
# Copyright (c) 2016 - 2021 Deephaven Data Labs and Patent Pending
#
"""
Utilities for gathering Deephaven table data into Python objects
"""

import numpy as np
import enum
import jpy
import wrapt

# None until the first _defineSymbols() call
_gatherer = None

def _defineSymbols():
    if not jpy.has_jvm():
        raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

    global _gatherer
    global Layout

    if _gatherer is None:
        _gatherer = jpy.get_type("io.deephaven.integrations.learn.gather.NumPy")

class MemoryLayout(enum.Enum):
    """
    Memory layouts for an array.
    """
    ROW_MAJOR = True
    """Row-major memory layout."""
    COLUMN_MAJOR = False
    """Column-major memory layout."""
    C = True
    """Memory layout consistent with C arrays (row-major)."""
    FORTRAN = False
    """Memory layout consistent with Fortran arrays (column-major)."""

    def __init__(self, is_row_major):
        self.is_row_major = is_row_major


# Every method that depends on symbols defined via _defineSymbols() should be decorated with @_passThrough
@wrapt.decorator
def _passThrough(wrapped, instance, args, kwargs):
"""
For decoration of module methods, to define necessary symbols at runtime
:param wrapped: the method to be decorated
:param instance: the object to which the wrapped function was bound when it was called
:param args: the argument list for `wrapped`
:param kwargs: the keyword argument dictionary for `wrapped`
:return: the decorated version of the method
"""

_defineSymbols()
return wrapped(*args, **kwargs)

try:
_defineSymbols()
except Exception as e:
pass

@_passThrough
def convert_to_numpy_dtype(dtype):
"""
Convert an input type to the corresponding NumPy data type
:param dtype: A Python type
"""
if dtype.__module__ == np.__name__:
return dtype
elif dtype == bool:
dtype = np.bool_
elif dtype == float:
dtype = np.double
elif dtype == int:
dtype = np.intc
else:
raise ValueError(f"{dtype} is not a data type that can be converted to a NumPy dtype.")
return dtype

@_passThrough
def table_to_numpy_2d(row_set, col_set, order: MemoryLayout = MemoryLayout.ROW_MAJOR, dtype: np.dtype = np.intc):
    """
    Convert Deephaven table data to a 2d NumPy array of the appropriate size
    :param row_set: A RowSequence describing the number of rows in the table
    :param col_set: ColumnSources describing which columns to copy
    :param order: The desired memory layout of the output array
    :param dtype: The desired NumPy data type of the output NumPy array
    :return: A NumPy ndarray
    """

    if not isinstance(order, MemoryLayout):
        raise ValueError(f"Invalid major order {order}. Please use an enum value from MemoryLayout.")

    dtype = convert_to_numpy_dtype(dtype)

    if dtype == np.byte:
        buffer = _gatherer.tensorBuffer2DByte(row_set, col_set, order.is_row_major)
    elif dtype == np.short:
        buffer = _gatherer.tensorBuffer2DShort(row_set, col_set, order.is_row_major)
    elif dtype == np.intc:
        buffer = _gatherer.tensorBuffer2DInt(row_set, col_set, order.is_row_major)
    elif dtype == np.int_:
        buffer = _gatherer.tensorBuffer2DLong(row_set, col_set, order.is_row_major)
    elif dtype == np.single:
        buffer = _gatherer.tensorBuffer2DFloat(row_set, col_set, order.is_row_major)
    elif dtype == np.double:
        buffer = _gatherer.tensorBuffer2DDouble(row_set, col_set, order.is_row_major)
    else:
        raise ValueError(f"Data type {dtype} is not supported.")

    tensor = np.frombuffer(buffer, dtype = dtype)

    if order.is_row_major:
        tensor.shape = (len(col_set), row_set.intSize())
        return tensor.T
    else:
        tensor.shape = (row_set.intSize(), len(col_set))
        return tensor
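For reference, a NumPy-only sketch of the shape handling at the end of table_to_numpy_2d: both branches yield a (rows, columns) array, the row-major branch by reshaping to (columns, rows) and returning the transposed view. The buffer contents below are hypothetical; the real bytes come from the Java-side gatherer (io.deephaven.integrations.learn.gather.NumPy), which is not part of this Python diff.

import numpy as np

n_rows, n_cols = 4, 3
buffer = np.arange(n_rows * n_cols, dtype = np.intc).tobytes()  # stand-in for the Java buffer

tensor = np.frombuffer(buffer, dtype = np.intc)

# Column-major branch: view the flat buffer directly as (rows, columns)
col_major = tensor.reshape(n_rows, n_cols)

# Row-major branch: view as (columns, rows), then transpose to (rows, columns);
# the transpose is a view, so no data is copied
row_major = tensor.reshape(n_cols, n_rows).T

print(col_major.shape, row_major.shape)  # (4, 3) (4, 3)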
170 changes: 170 additions & 0 deletions Integrations/python/test/test_learn_gather.py
@@ -0,0 +1,170 @@
#
# Copyright (c) 2016-2021 Deephaven Data Labs and Patent Pending
#

##############################################################################
# NOTE: the jvm should have been initialized, or this test will certainly fail
##############################################################################

import pandas as pd
import numpy as np
import unittest
import jpy
import sys
import os

from deephaven import learn, tableToDataFrame, TableTools
from deephaven.learn import gather

class TestGather(unittest.TestCase):
    """
    Test cases for deephaven.learn submodule
    """

    @classmethod
    def setUpClass(cls):
        """
        Inherited method allowing initialization of test environment
        """
        # Tables
        cls.bool_table = TableTools.emptyTable(100).update(
            "X = true",
            "Y = false",
            "Z = (i % 2 == 0) ? true : false"
        )
        cls.byte_table = TableTools.emptyTable(100).update(
            "X = (byte)i",
            "Y = (byte)(100 - X)",
            "Z = (byte)(-101 + X)"
        )
        cls.short_table = TableTools.emptyTable(100).update(
            "X = (short)i",
            "Y = (short)(100 - X)",
            "Z = (short)(-101 + X)"
        )
        cls.int_table = TableTools.emptyTable(100).update(
            "X = (int)i",
            "Y = 100 - X",
            "Z = -101 + X"
        )
        cls.long_table = TableTools.emptyTable(100).update(
            "X = (long)i",
            "Y = 100 - X",
            "Z = -101 + X"
        )
        cls.float_table = TableTools.emptyTable(100).update(
            "X = (float)i",
            "Y = (float)sqrt(X)",
            "Z = (float)sqrt(Y)"
        )
        cls.double_table = TableTools.emptyTable(100).update(
            "X = (double)i",
            "Y = sqrt(X)",
            "Z = sqrt(Y)"
        )
        # NumPy arrays
        cls.bool_array = \
            np.array([[True, False, True], [True, False, False]] * 50,
                     dtype = np.bool_)
        cls.byte_array = np.vstack((
            np.arange(0, 100, dtype = np.byte),
            np.arange(100, 0, -1, dtype = np.byte),
            np.arange(-101, -1, dtype = np.byte)
        )).T
        cls.short_array = np.vstack((
            np.arange(0, 100, dtype = np.short),
            np.arange(100, 0, -1, dtype = np.short),
            np.arange(-101, -1, dtype = np.short)
        )).T
        cls.int_array = np.vstack((
            np.arange(0, 100, dtype = np.intc),
            np.arange(100, 0, -1, dtype = np.intc),
            np.arange(-101, -1, dtype = np.intc)
        )).T
        cls.long_array = np.vstack((
            np.arange(0, 100, dtype = np.int_),
            np.arange(100, 0, -1, dtype = np.int_),
            np.arange(-101, -1, dtype = np.int_)
        )).T
        cls.float_array = np.vstack((
            np.arange(0, 100, dtype = np.single),
            np.sqrt(np.arange(0, 100, dtype = np.single)),
            np.sqrt(np.sqrt(np.arange(0, 100, dtype = np.single)))
        )).T
        cls.double_array = np.vstack((
            np.arange(0, 100, dtype = np.double),
            np.sqrt(np.arange(0, 100, dtype = np.double)),
            np.sqrt(np.sqrt(np.arange(0, 100, dtype = np.double)))
        )).T

    # Model for learn to use when dtype = [np.bool_]
    def boolean_model(self, features):
        return np.count_nonzero(features, axis = 1) < 2

    # Model for learn to use when dtype = [np.byte, np.short, np.intc, np.int_]
    def integer_model(self, features):
        return np.sum(features, axis = 1)

    # Model for learn to use when dtype = [np.single, np.double]
    def decimal_model(self, features):
        return np.prod(features, axis = 1)

    # Test byte data types
    def test_byte(self):
        self.base_test(source = self.byte_table, model = self.integer_model, np_dtype = np.byte)

    # Test short data types
    def test_short(self):
        self.base_test(source = self.short_table, model = self.integer_model, np_dtype = np.short)

    # Test int data types
    def test_int(self):
        self.base_test(source = self.int_table, model = self.integer_model, np_dtype = np.intc)

    # Test long data types
    def test_long(self):
        self.base_test(source = self.long_table, model = self.integer_model, np_dtype = np.int_)

    # Test float data types
    def test_float(self):
        self.base_test(source = self.float_table, model = self.decimal_model, np_dtype = np.single)

    # Test double data types
    def test_double(self):
        self.base_test(source = self.double_table, model = self.decimal_model, np_dtype = np.double)

    # The base test, which other tests will be built from
    def base_test(self, source, model, np_dtype):

        rows = source.getRowSet()
        cols = [source.getColumnSource(col) for col in ["X", "Y", "Z"]]

        gatherer_rowmajor = lambda rowset, colset : gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.ROW_MAJOR, np_dtype)
        gatherer_colmajor = lambda rowset, colset : gather.table_to_numpy_2d(rowset, colset, gather.MemoryLayout.COLUMN_MAJOR, np_dtype)

        array_from_table = tableToDataFrame(source).values

        gathered_rowmajor = gatherer_rowmajor(rows, cols)
        gathered_colmajor = gatherer_colmajor(rows, cols)

        with self.subTest(msg = "Array shape"):
            self.assertTrue(gathered_rowmajor.shape == array_from_table.shape)
            print("Row major gathered shape: {}".format(gathered_rowmajor.shape))
            self.assertTrue(gathered_colmajor.shape == array_from_table.shape)
            print("Column major gathered shape: {}".format(gathered_colmajor.shape))
        with self.subTest(msg = "Values in array"):
            self.assertTrue(np.allclose(gathered_rowmajor, array_from_table))
            print("All row-major array values are equal")
            self.assertTrue(np.allclose(gathered_colmajor, array_from_table))
            print("All column-major array values are equal")
        with self.subTest(msg = "Array data type"):
            self.assertTrue(gathered_rowmajor.dtype == np_dtype)
            self.assertTrue(gathered_rowmajor.dtype == array_from_table.dtype)
            self.assertTrue(gathered_colmajor.dtype == np_dtype)
            self.assertTrue(gathered_colmajor.dtype == array_from_table.dtype)
            self.assertTrue(gathered_rowmajor.dtype == gathered_colmajor.dtype)
            print("Array dtype: {}".format(np_dtype))
        with self.subTest(msg = "Contiguity"):
            self.assertTrue(gathered_rowmajor.flags["C_CONTIGUOUS"] or gathered_rowmajor.flags["F_CONTIGUOUS"])
            self.assertTrue(gathered_colmajor.flags["C_CONTIGUOUS"] or gathered_colmajor.flags["F_CONTIGUOUS"])
            print("Array contiguity checked")
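A short illustration of the "Contiguity" subtest above (an assumed aside, not part of the commit): the check passes for either layout because the transpose of a C-ordered 2-D array is reported as Fortran-contiguous.

import numpy as np

a = np.arange(6, dtype = np.intc).reshape(2, 3)  # C-contiguous
b = a.T                                          # same data, Fortran-contiguous view

print(a.flags["C_CONTIGUOUS"], a.flags["F_CONTIGUOUS"])  # True False
print(b.flags["C_CONTIGUOUS"], b.flags["F_CONTIGUOUS"])  # False True
print(b.flags["C_CONTIGUOUS"] or b.flags["F_CONTIGUOUS"])  # True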