Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarks for nested element assignment #54

Merged
merged 2 commits into from
May 7, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 91 additions & 7 deletions benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,98 @@
For more information on writing benchmarks:
https://asv.readthedocs.io/en/stable/writing_benchmarks.html."""

from nested_pandas import example_benchmarks
import numpy as np
import pandas as pd
import pyarrow as pa
from nested_pandas import NestedDtype


def time_computation():
    """Timing benchmark; names prefixed with 'time' are time computations."""
    example_benchmarks.runtime_computation()
class AssignSingleDfToNestedSeries:
    """Benchmark the performance of changing a single nested series element"""

    # Benchmark fixtures; both are assigned in ``setup``.
    new_df: pd.DataFrame
    series: pd.Series

    # Benchmark sizes: number of elements in the series and number of rows per data frame.
    n_objects = 10_000
    n_sources = 100

def mem_list():
    """Memory benchmark; names prefixed with 'mem' or 'peakmem' are memory computations."""
    measured = example_benchmarks.memory_computation()
    return measured
def setup(self):
    """Set up the benchmark environment.

    Builds ``self.new_df``, the data frame that ``run`` assigns into the series, and
    ``self.series``, a nested series holding ``n_objects`` copies of an original data
    frame. Every data frame has ``n_sources`` rows and "time", "flux" and "band" columns.
    """
    self.new_df = pd.DataFrame(
        {
            "time": np.arange(self.n_sources, dtype=np.float64),
            "flux": np.linspace(0, 1, self.n_sources),
            # np.full(shape, fill_value) repeats the band label n_sources times.
            # np.full_like("lsstg", n) would use the string as the prototype array and
            # n as the fill value, producing a 0-d array containing str(n).
            "band": np.full(self.n_sources, "lsstg"),
        }
    )
    original_df = pd.DataFrame(
        {
            "time": np.linspace(0, 1, self.n_sources),
            "flux": np.arange(self.n_sources, dtype=np.float64),
            "band": np.full(self.n_sources, "sdssu"),
        }
    )
    self.series = pd.Series(
        [original_df] * self.n_objects,
        # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we
        # need to order by field name here for backwards compatibility.
        dtype=NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()}),
    )

def run(self):
    """Run the benchmark: assign the new data frame to the middle element of the series."""
    middle = self.n_objects // 2
    self.series[middle] = self.new_df

def time_run(self):
    """Runtime benchmark for changing a single nested series element; delegates to ``run``."""
    self.run()

def peakmem_run(self):
    """Memory-usage benchmark for changing a single nested series element; delegates to ``run``."""
    self.run()


class ReassignHalfOfNestedSeries:
    """Benchmark the performance of changing a lot of nested series elements"""

    # Benchmark sizes: number of elements in the series and number of rows per data frame.
    n_objects = 10_000
    n_sources = 100
    # Benchmark fixtures; both are assigned in ``setup``.
    series: pd.Series
    new_series: pd.Series

    def setup(self):
        """Set up the benchmark environment.

        Builds ``self.series`` with ``n_objects`` copies of an original data frame and
        ``self.new_series`` with ``n_objects // 2`` copies of a replacement data frame.
        Both series share the same nested dtype, and every data frame has ``n_sources``
        rows and "time", "flux" and "band" columns.
        """
        # When we had NestedExtensionArray inheriting ArrowExtensionArray, it sorted the fields, so we need to
        # order by field name here for backwards compatibility.
        dtype = NestedDtype.from_fields({"band": pa.string(), "flux": pa.float64(), "time": pa.float64()})
        original_df = pd.DataFrame(
            {
                "time": np.linspace(0, 1, self.n_sources),
                "flux": np.arange(self.n_sources, dtype=np.float64),
                # np.full(shape, fill_value) repeats the band label n_sources times.
                # np.full_like("sdssu", n) would use the string as the prototype array and
                # n as the fill value, producing a 0-d array containing str(n).
                "band": np.full(self.n_sources, "sdssu"),
            }
        )
        self.series = pd.Series(
            [original_df] * self.n_objects,
            dtype=dtype,
        )

        new_df = pd.DataFrame(
            {
                "time": np.arange(self.n_sources, dtype=np.float64),
                "flux": np.linspace(0, 1, self.n_sources),
                "band": np.full(self.n_sources, "lsstg"),
            }
        )
        self.new_series = pd.Series([new_df] * (self.n_objects // 2), dtype=dtype)

    def run(self):
        """Run the benchmark: assign ``new_series`` to every second element of ``series``."""
        self.series[::2] = self.new_series

    def time_run(self):
        """Benchmark the runtime of changing half of the nested series elements."""
        self.run()

    def peakmem_run(self):
        """Benchmark the memory usage of changing half of the nested series elements."""
        self.run()