From 251d422f69b6ceddca8f14fcc5f02c1acb5d1b0b Mon Sep 17 00:00:00 2001 From: Jake Mulford Date: Wed, 9 Feb 2022 11:44:33 -0500 Subject: [PATCH 1/3] time series notebooks feedback from pete --- .../A1 Create tables and fake data .md | 68 +++++++------------ .../A2 Filter and decorate .md | 13 ++-- .../A3 Do time series and relational joins.md | 47 ++++++------- 3 files changed, 54 insertions(+), 74 deletions(-) diff --git a/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md b/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md index ba5e567c207..1919742127b 100644 --- a/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md +++ b/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md @@ -2,67 +2,49 @@ Throughout this demo notebook series, we show many of the ways to interact with real-time data in Deephaven. Here, we create some tables with fake data; in other notebooks, we show how to perform table operations on that data. Knowing how to create fake ticking tables is useful for familiarizing yourself with Deephaven, but also for working on proof of concepts without necessarily having a complete dataset. -The following Python code contains a method that creates a table of random integers and characters, with each row in the table also containing a timestamp. +The following Python code contains a method that creates a table of random integers, characters, and booleans, with each row in the table also containing a timestamp. 
```python -from deephaven import DynamicTableWriter -from deephaven.DateTimeUtils import plus -import deephaven.Types as dht +from deephaven.TableTools import timeTable import random import string -import threading -def create_random_table(number_of_rows, start_time, time_offset): +def random_int(): + return random.randint(1,100) +def random_character(): + return random.choice(string.ascii_uppercase) +def random_boolean(): + return random.choice([True, False]) + +def create_random_table(time_interval, start_time=None): """ Creates a Deephaven table containing rows of random integers from 1 to 99, random uppercase characters, and timestamps. - + Parameters: - number_of_rows (int): The number of rows that the resulting table will contain. - start_time (DateTime): The Deephaven date-time of the first row in the table. - time_offset (Period): A Period object representing the timestamp difference between - each row in the table. + time_interval (str||int): String or int representation of the time interval between rows. + start_time (str||DateTime): Optional string or DateTime representation of the start time. Returns: A Deephaven Table containing the random data. 
""" - def thread_function(number_of_rows, start_time, time_offset, table_writer): - time = start_time - for i in range(number_of_rows): - random_number = random.randint(1, 100) - random_character = random.choice(string.ascii_uppercase) - random_boolean = random.choice([True, False]) - table_writer.logRow(time, random_number, random_character, random_boolean) - time = plus(time, time_offset) - - column_names = ["DateTime", "Number", "Character", "Boolean"] - column_types = [dht.datetime, dht.int_, dht.string, dht.bool_] - table_writer = DynamicTableWriter(column_names, column_types) - - thread = threading.Thread(target=thread_function, args=(number_of_rows, start_time, time_offset, table_writer)) - thread.start() - - return table_writer.getTable() + table = None + if start_time is None: + table = timeTable(time_interval) + else: + table = timeTable(start_time, time_interval) + + return table.update("Number = (int)random_int()")\ + .update("Character = (String)random_character()")\ + .update("Boolean = (boolean)random_boolean()") ``` We can use this method to create some tables with random data. 
```python -from deephaven.DateTimeUtils import Period, convertDateTime - -start_time = convertDateTime("2000-01-01T00:00:00 NY") - -time_offset = Period("T1S") -random_table_1_second_offset_small = create_random_table(1000, start_time, time_offset) -random_table_1_second_offset_large = create_random_table(100000, start_time, time_offset) - -time_offset = Period("T10S") -random_table_10_seconds_offset_small = create_random_table(1000, start_time, time_offset) -random_table_10_seconds_offset_large = create_random_table(100000, start_time, time_offset) - -time_offset = Period("T0.1S") -random_table_tenth_second_offset_small = create_random_table(1000, start_time, time_offset) -random_table_tenth_second_offset_large = create_random_table(100000, start_time, time_offset) +random_table_1_second_offset = create_random_table("00:00:01") +random_table_10_seconds_offset = create_random_table("00:00:10") +random_table_tenth_second_offset = create_random_table("00:00:00.1") ``` [The next notebook](A2%20Filter%20and%20decorate.md) will show how to filter and decorate this data. diff --git a/demo/web/src/main/notebooks/A. Real-time table ops/A2 Filter and decorate .md b/demo/web/src/main/notebooks/A. Real-time table ops/A2 Filter and decorate .md index 8ce92e8eab7..04d074713a0 100644 --- a/demo/web/src/main/notebooks/A. Real-time table ops/A2 Filter and decorate .md +++ b/demo/web/src/main/notebooks/A. Real-time table ops/A2 Filter and decorate .md @@ -2,19 +2,22 @@ In our previous notebook, we showed how to create some tables with fake data. In this notebook, we show how to decorate and filter our data. -Let's start by simulating a year's worth of daily measurements. This could represent something like stock prices, temperatures, etc. +Let's start by simulating measurements of our values every minute. This could represent something like stock prices, temperatures, etc. 
```python -start_time = convertDateTime("2020-01-01T00:00:00 NY") +from deephaven.DateTimeUtils import currentTime, expressionToNanos, minus -time_offset = Period("1D") -daily_data = create_random_table(365, start_time, time_offset) +time_interval = expressionToNanos("T1M") +offset = expressionToNanos("10D") +now = currentTime() + +daily_data = create_random_table(time_interval, start_time=minus(now, offset)) ``` Now we decorate the data by adding its day of the week. ```python -daily_data = daily_data.update("DayOfWeekInt = dayOfWeek(DateTime, TZ_NY)") +daily_data = daily_data.update("DayOfWeekInt = dayOfWeek(Timestamp, TZ_NY)") ``` Next, we convert the day of week to a string representation. diff --git a/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md b/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md index f823985ffd5..e326dcf27e0 100644 --- a/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md +++ b/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md @@ -2,18 +2,16 @@ In our [previous notebook](A2%20Filter%20and%20decorate%20.md), we showed how to filter and decorate our time tables. In this notebook, we show how to perform joins with our time series data. -Let's start again by simulating a year's worth of daily measurements, but this time using two tables with slightly different timestamps. This is great for a simulation because there's no guarantee that a real example will collect data with exact timestamp matches. +Let's start again by simulating measurements of our values every minute, but this time using two tables with slightly different timestamps. This is great for a simulation because there's no guarantee that a real example will collect data with exact timestamp matches. 
```python -time_offset = Period("1D") +time_interval = expressionToNanos("T1M") +offset_0 = expressionToNanos("10DT2S") +offset_1 = expressionToNanos("10D") +now = currentTime() -start_times = [ - convertDateTime("2020-01-01T00:00:00 NY"), - convertDateTime("2020-01-01T00:00:02 NY") -] - -daily_data_0 = create_random_table(365, start_times[0], time_offset) -daily_data_1 = create_random_table(365, start_times[1], time_offset) +daily_data_0 = create_random_table(time_interval, start_time=minus(now, offset_0)) +daily_data_1 = create_random_table(time_interval, start_time=minus(now, offset_1)) ``` To join these tables together based on the timestamps, we need to use an [as-of join, or `aj`](https://deephaven.io/core/docs/reference/table-operations/join/aj/). As-of joins perform exact matches across all given columns except for the last one, which instead matches based on the closest values. @@ -23,29 +21,26 @@ For an `aj`, the values in the right table are matched to the closest values in Let's join these tables using an `aj` to get a single table with all of our information. ```python -joined_data_aj = daily_data_0.aj(daily_data_1, "DateTime", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") +joined_data_aj = daily_data_0.aj(daily_data_1, "Timestamp", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") ``` -If you look at the `joined_data_aj` table, you may not see what you'd expect. Specifically, the first row won't have any values from our `daily_data_1` table, and the last row of the `daily_data_1` table isn't present. What happened? - -Remember that an `aj` works based on a search where values in the right table are matched to the closest values in the left table without going over the left value: - -- If there's a value in the left table that doesn't match a value in the right table, the created row will have `NULL` values for what would have been the values from the right table. 
-- If there's a value in the right table that doesn't match a value in the left table, the row in the right table won't be included in the joined table. - -When looking at the first row in our tables, the timestamp for the right table is `2020-01-01T00:00:02.000` and the timestamp for the left table is `2020-01-01T00:00:00.000`. Since these are the lowest values in our table, there's no match for the left table's timestamp since all of the values in the right table are greater than it, resulting in the row with `NULL` values. - -This also explains how the last row in `daily_data_1` is lost. The timestamp value of the last row in `daily_data_1` is `2020-12-29T00:00:02.000`. Since all the values in the left table are less than this value, there can't be a match without going over the left value, resulting in this row being lost. +Deephaven supports another type of as of join, an `raj`. For a `raj`, the values in the right table are matched to the closest values in the left table without going under the left value. For example, if the right table contains a value `5` and the left table contains values `4` and `6`, the right table's `5` will be matched on the left table's `4`. -How can these tables join as expected? We could flip the left and right tables, but then our timestamp column will contain the values in `daily_data_1`, which are a bit messy. Instead, we can use a [reverse as-of join, or `raj`](https://deephaven.io/core/docs/reference/table-operations/join/raj/) to keep the same left and right tables and match our timestamps as we'd expect. +Let's also join these tables using a `raj`. -For a `raj`, the values in the right table are matched to the closest values in the left table without going under the left value. For example, if the right table contains a value `5` and the left table contains values `4` and `6`, the right table's `5` will be matched on the left table's `4`. 
+```python +joined_data_raj = daily_data_0.raj(daily_data_1, "Timestamp", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") +``` -Let's join these tables using a `raj`. +As of joins work very well with time-tables that sample at different frequencies. Let's create two new tables, one that samples every second and one that samples every ten seconds, and show what happesn when we join them together using `aj` and `raj`. ```python -joined_data_raj = daily_data_0.raj(daily_data_1, "DateTime", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") -``` +time_interval_0 = expressionToNanos("T1S") +time_interval_1 = expressionToNanos("T10S") -And now we have our table joined as expected. +sample_data_0 = create_random_table(time_interval_0) +sample_data_1 = create_random_table(time_interval_1) +sample_data_aj = sample_data_0.aj(sample_data_1, "Timestamp", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") +sample_data_raj = sample_data_0.raj(sample_data_1, "Timestamp", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") +``` From 33559295f8165c0a676efd3e3ba1eea16271b417 Mon Sep 17 00:00:00 2001 From: Jake Mulford Date: Wed, 9 Feb 2022 11:53:18 -0500 Subject: [PATCH 2/3] showed separate example of creating fake data outside of the method --- .../A1 Create tables and fake data .md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md b/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md index 1919742127b..4251fccc7c7 100644 --- a/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md +++ b/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md @@ -2,7 +2,7 @@ Throughout this demo notebook series, we show many of the ways to interact with real-time data in Deephaven. 
Here, we create some tables with fake data; in other notebooks, we show how to perform table operations on that data. Knowing how to create fake ticking tables is useful for familiarizing yourself with Deephaven, but also for working on proof of concepts without necessarily having a complete dataset. -The following Python code contains a method that creates a table of random integers, characters, and booleans, with each row in the table also containing a timestamp. +`timeTable` is a great tool to simulate real-time data. We can use this and Python's `random` library to generate some fake data. ```python from deephaven.TableTools import timeTable @@ -17,6 +17,14 @@ def random_character(): def random_boolean(): return random.choice([True, False]) +table = timeTable("00:00:01").update("Number = (int)random_int()")\ + .update("Character = (String)random_character()")\ + .update("Boolean = (boolean)random_boolean()") +``` + +Let's wrap `timeTable` with a method and parameterize the time intervals and start times. This will allow us to reuse it throughout the notebooks. + +```python def create_random_table(time_interval, start_time=None): """ Creates a Deephaven table containing rows of random integers from 1 to 99, random From 56b3c0a5d491da18483f7837dc7aa743dce60cea Mon Sep 17 00:00:00 2001 From: margaretkennedy <82049573+margaretkennedy@users.noreply.github.com> Date: Fri, 18 Feb 2022 05:29:37 -0500 Subject: [PATCH 3/3] Apply suggestions from code review --- .../A1 Create tables and fake data .md | 2 +- .../A3 Do time series and relational joins.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md b/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md index 4251fccc7c7..7b1297f40d0 100644 --- a/demo/web/src/main/notebooks/A. Real-time table ops/A1 Create tables and fake data .md +++ b/demo/web/src/main/notebooks/A. 
Real-time table ops/A1 Create tables and fake data .md @@ -22,7 +22,7 @@ table = timeTable("00:00:01").update("Number = (int)random_int()")\ .update("Boolean = (boolean)random_boolean()") ``` -Let's wrap `timeTable` with a method and parameterize the time intervals and start times. This will allow us to reuse it throughout the notebooks. +Let's wrap `timeTable` with a method and parameterize the time intervals and start times. This will allow us to reuse the table throughout the notebooks. ```python def create_random_table(time_interval, start_time=None): diff --git a/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md b/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md index e326dcf27e0..bbb8557f4dd 100644 --- a/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md +++ b/demo/web/src/main/notebooks/A. Real-time table ops/A3 Do time series and relational joins.md @@ -24,15 +24,15 @@ Let's join these tables using an `aj` to get a single table with all of our info joined_data_aj = daily_data_0.aj(daily_data_1, "Timestamp", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") ``` -Deephaven supports another type of as of join, an `raj`. For a `raj`, the values in the right table are matched to the closest values in the left table without going under the left value. For example, if the right table contains a value `5` and the left table contains values `4` and `6`, the right table's `5` will be matched on the left table's `4`. +Deephaven supports another type of as-of join, a reverse as-of join. Using `raj`, the values in the right table are matched to the closest values in the left table without going under the left value. For example, if the right table contains a value `5` and the left table contains values `4` and `6`, the right table's `5` will be matched on the left table's `4`. -Let's also join these tables using a `raj`.
+Let's also join these tables with the `raj` method. ```python joined_data_raj = daily_data_0.raj(daily_data_1, "Timestamp", "Number1 = Number, Character1 = Character, Boolean1 = Boolean") ``` -As of joins work very well with time-tables that sample at different frequencies. Let's create two new tables, one that samples every second and one that samples every ten seconds, and show what happesn when we join them together using `aj` and `raj`. +As-of joins work very well with time tables that sample at different frequencies. Let's create two new tables, one that samples every second and one that samples every ten seconds, and show what happens when we join them together using `aj` and `raj`. ```python time_interval_0 = expressionToNanos("T1S")