Release: v0.2.2 #191

Merged: 16 commits, Nov 12, 2021
2 changes: 1 addition & 1 deletion .coveragerc
@@ -13,5 +13,5 @@ ignore_errors = True
omit =
    tests/*
    meerkat/contrib/*
-    meerkat/nn/*
+    meerkat/ml/*
    setup.py
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -122,7 +122,7 @@ jobs:

    - name: Install Dependencies
      run: |
-       pip install -e ".[dev, text, vision, tabular, embeddings-linux, medimg]"
+       pip install -e ".[dev, text, vision, tabular, embeddings-linux, medimg, ml]"

    - name: Test with pytest
      run: |
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@

<div align="center">
<img src="docs/meerkat_banner.png" height=100 alt="Meerkat logo"/>
<img src="docs/assets/meerkat_banner.png" height=100 alt="Meerkat logo"/>
</div>

-----
File renamed without changes
File renamed without changes
Binary file removed docs/robustnessgym.png
8 changes: 3 additions & 5 deletions meerkat/__init__.py
@@ -1,18 +1,15 @@
"""Import common classes."""
# flake8: noqa

-from meerkat.logging.utils import (
-    initialize_logging,
-    set_logging_level,
-    set_logging_level_for_imports,
-)
+from meerkat.logging.utils import initialize_logging

initialize_logging()

from meerkat.cells.abstract import AbstractCell
from meerkat.cells.spacy import LazySpacyCell, SpacyCell
from meerkat.cells.volume import MedicalVolumeCell
from meerkat.columns.abstract import AbstractColumn
+from meerkat.columns.arrow_column import ArrowArrayColumn
from meerkat.columns.cell_column import CellColumn
from meerkat.columns.image_column import ImageColumn
from meerkat.columns.lambda_column import LambdaCell, LambdaColumn
@@ -23,6 +20,7 @@
from meerkat.columns.tensor_column import TensorColumn
from meerkat.columns.video_column import VideoColumn
from meerkat.columns.volume_column import MedicalVolumeColumn
+from meerkat.contrib.registry import datasets
from meerkat.datapanel import DataPanel
from meerkat.ops.concat import concat
from meerkat.ops.merge import merge
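A rough sketch of how the updated top-level namespace reads in practice (hedged: it assumes `ArrowArrayColumn` can be constructed directly from a `pyarrow` array, like the other columns are from their backing data; the values are made up):

```python
import pyarrow as pa

import meerkat as mk

# initialize_logging() still runs on import; only that helper is re-exported now.
col = mk.ArrowArrayColumn(pa.array(["a", "b", "c"]))  # new top-level export
dp = mk.DataPanel({"letters": col})

# the contrib dataset registry is now importable as mk.datasets
print(mk.datasets)
```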
15 changes: 10 additions & 5 deletions meerkat/block/abstract.py
@@ -18,17 +18,20 @@

@dataclass
class BlockView:
-    data: object
    block_index: BlockIndex
    block: AbstractBlock
+
+    @property
+    def data(self):
+        return self.block._get_data(self.block_index)


class AbstractBlock:
    def __init__(self, *args, **kwargs):
        super(AbstractBlock, self).__init__(*args, **kwargs)

    def __getitem__(self, index: BlockIndex) -> BlockView:
-        return BlockView(data=self._get_data(index), block_index=index, block=self)
+        return BlockView(block_index=index, block=self)

    def _get_data(self, index: BlockIndex) -> object:
        """Must return view of the underlying data."""
@@ -39,9 +42,11 @@ def signature(self) -> Hashable:
        raise NotImplementedError

    @classmethod
-    def from_data(
-        cls, data: object, names: Sequence[str]
-    ) -> Tuple[AbstractBlock, Mapping[str, BlockIndex]]:
+    def from_column_data(cls, data: object) -> Tuple[AbstractBlock, BlockView]:
        raise NotImplementedError()
+
+    @classmethod
+    def from_block_data(cls, data: object) -> Tuple[AbstractBlock, BlockView]:
+        raise NotImplementedError()

    @classmethod
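To make the `BlockView` change concrete, here is a minimal sketch with a toy subclass (`DictBlock` is hypothetical, purely for illustration): `__getitem__` no longer materializes data eagerly, and `view.data` now calls back into `block._get_data` on access.

```python
import numpy as np

from meerkat.block.abstract import AbstractBlock, BlockView


class DictBlock(AbstractBlock):
    """Toy block backed by a dict of numpy arrays (illustration only)."""

    def __init__(self, data, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data = data

    def _get_data(self, index):
        return self.data[index]


block = DictBlock({"a": np.arange(3)})
view = block["a"]         # BlockView(block_index="a", block=block), no data copied yet
print(view.block_index)   # "a"
print(view.data)          # [0 1 2], fetched lazily via block._get_data
```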
147 changes: 147 additions & 0 deletions meerkat/block/arrow_block.py
@@ -0,0 +1,147 @@
from __future__ import annotations

import os
from dataclasses import dataclass
from typing import Hashable, List, Sequence, Union

import numpy as np
import pandas as pd
import pyarrow as pa
import torch

from meerkat.block.ref import BlockRef
from meerkat.columns.numpy_column import NumpyArrayColumn
from meerkat.columns.tensor_column import TensorColumn

from .abstract import AbstractBlock, BlockIndex, BlockView


class ArrowBlock(AbstractBlock):
    @dataclass(eq=True, frozen=True)
    class Signature:
        nrows: int
        klass: type
        # mmap: bool

    def __init__(self, data: pa.Table, *args, **kwargs):
        super(ArrowBlock, self).__init__(*args, **kwargs)
        self.data = data

    @property
    def signature(self) -> Hashable:
        return self.Signature(klass=ArrowBlock, nrows=len(self.data))

    def _get_data(self, index: BlockIndex) -> pa.Array:
        return self.data[index]

    @classmethod
    def from_column_data(cls, data: pa.Array) -> BlockView:
        data = pa.Table.from_pydict({"col": data})
        block = cls(data)
        return BlockView(block=block, block_index="col")

    @classmethod
    def from_block_data(cls, data: pa.Table) -> List[BlockView]:
        block = cls(data)
        return [
            BlockView(block=block, block_index=column) for column in data.column_names
        ]

    @classmethod
    def _consolidate(
        cls,
        block_refs: Sequence[BlockRef],
    ) -> BlockRef:
        table = pa.Table.from_pydict(
            # need to ignore index when concatenating
            {
                name: ref.block.data[col._block_index]
                for ref in block_refs
                for name, col in ref.items()
            }
        )
        block = cls(table)

        # pull out the block columns from all the block_refs
        columns = {}
        for ref in block_refs:
            columns.update(ref)

        new_columns = {
            name: col._clone(data=block[name]) for name, col in columns.items()
        }
        return BlockRef(block=block, columns=new_columns)

    @staticmethod
    def _convert_index(index):
        if isinstance(index, list):
            return np.array(index)
        if torch.is_tensor(index):
            # need to convert to numpy for boolean indexing
            return index.numpy()
        if isinstance(index, NumpyArrayColumn):
            return index.data
        if isinstance(index, TensorColumn):
            # need to convert to numpy for boolean indexing
            return index.data.numpy()
        if isinstance(index, pd.Series):
            # need to convert to numpy for boolean indexing
            return index.values
        from meerkat.columns.pandas_column import PandasSeriesColumn

        if isinstance(index, PandasSeriesColumn):
            return index.data.values

        return index

    def _get(
        self, index, block_ref: BlockRef, materialize: bool = True
    ) -> Union[BlockRef, dict]:
        index = self._convert_index(index)
        # TODO: check if they're trying to index more than just the row dimension

        if isinstance(index, slice) or isinstance(index, int):
            data = self.data[index]
        elif index.dtype == bool:
            data = self.data.filter(pa.array(index))
        else:
            data = self.data.take(index)

        if isinstance(index, int):
            # if indexing a single row, we do not return a block manager, just a dict
            return {
                name: data[col._block_index] for name, col in block_ref.columns.items()
            }
        block = self.__class__(data)

        columns = {
            name: col._clone(data=block[col._block_index])
            for name, col in block_ref.columns.items()
        }
        # note that the new block may share memory with the old block
        return BlockRef(block=block, columns=columns)

    @staticmethod
    def _write_table(path: str, table: pa.Table):
        # noqa E501, source: huggingface implementation https://github.com/huggingface/datasets/blob/92304b42cf0cc6edafc97832c07de767b81306a6/src/datasets/table.py#L50
        with open(path, "wb") as sink:
            writer = pa.RecordBatchStreamWriter(sink=sink, schema=table.schema)
            batches: List[pa.RecordBatch] = table.to_batches()
            for batch in batches:
                writer.write_batch(batch)
            writer.close()
            return sum(batch.nbytes for batch in batches)

    @staticmethod
    def _read_table(path: str, mmap: bool = False):
        if mmap:
            return pa.ipc.open_stream(pa.memory_map(path)).read_all()
        else:
            return pa.ipc.open_stream(pa.input_stream(path)).read_all()

    def _write_data(self, path: str):
        self._write_table(os.path.join(path, "data.arrow"), self.data)

    @staticmethod
    def _read_data(path: str, mmap: bool = False):
        return ArrowBlock._read_table(os.path.join(path, "data.arrow"), mmap=mmap)
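A minimal sketch of how the new block could be exercised directly (the values are made up; normally these calls happen inside the column and `BlockManager` machinery rather than user code):

```python
import pyarrow as pa

from meerkat.block.arrow_block import ArrowBlock

# Wrap a single pyarrow array: the block stores it as a one-column table
# and hands back a BlockView pointing at that "col" column.
view = ArrowBlock.from_column_data(pa.array([1, 2, 3]))
print(view.block_index)  # "col"
print(view.data)         # the underlying chunked array

# Wrap a whole table: one BlockView per column, all sharing the same block.
table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
views = ArrowBlock.from_block_data(table)
print([v.block_index for v in views])  # ["a", "b"]
```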
6 changes: 4 additions & 2 deletions meerkat/block/manager.py
@@ -18,7 +18,7 @@


class BlockManager(MutableMapping):
-    """This manager manages all blocks."""
+    """Manages all blocks in a DataPanel."""

    def __init__(self) -> None:
        self._columns: Dict[str, AbstractColumn] = {}  # ordered as of 3.7
@@ -309,11 +309,13 @@ def read(
                    column_dir,
                    _data=block[_deserialize_block_index(block_meta["block_index"])],
                    _meta=col_meta,
+                    **kwargs,
                )
                mgr.add_column(col, name)
            else:
                mgr.add_column(
-                    col_meta["dtype"].read(path=column_dir, _meta=col_meta), name
+                    col_meta["dtype"].read(path=column_dir, _meta=col_meta, **kwargs),
+                    name,
                )
        mgr.reorder(meta["_column_order"])
        return mgr
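If I am reading the new `**kwargs` plumbing correctly, the point is to let read-time options flow from the top-level `read` call down to each column's and block's reader. A hypothetical sketch (the `mmap` flag is the one hinted at by `ArrowBlock._read_data`; whether `DataPanel.read` forwards it exactly like this is an assumption, and the path is made up):

```python
from meerkat import DataPanel

# Hypothetical: extra keyword arguments are forwarded through BlockManager.read
# to each column/block reader, e.g. to memory-map Arrow-backed columns.
dp = DataPanel.read("path/to/saved/datapanel", mmap=True)
```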
9 changes: 5 additions & 4 deletions meerkat/block/numpy_block.py
@@ -4,15 +4,15 @@
import shutil
from dataclasses import dataclass
from mmap import mmap
-from typing import Hashable, Mapping, Sequence, Tuple, Union
+from typing import Hashable, Sequence, Tuple, Union

import numpy as np
import torch

from meerkat.block.ref import BlockRef
from meerkat.errors import ConsolidationError

-from .abstract import AbstractBlock, BlockIndex
+from .abstract import AbstractBlock, BlockIndex, BlockView


class NumpyBlock(AbstractBlock):
@@ -47,7 +47,7 @@ def _get_data(self, index: BlockIndex, materialize: bool = True) -> np.ndarray:
        return self.data[:, index]

    @classmethod
-    def from_data(cls, data: np.ndarray) -> Tuple[NumpyBlock, Mapping[str, BlockIndex]]:
+    def from_column_data(cls, data: np.ndarray) -> Tuple[NumpyBlock, BlockView]:
        """[summary]

        Args:
@@ -68,7 +68,8 @@ def from_data(cls, data: np.ndarray) -> Tuple[NumpyBlock, Mapping[str, BlockInde
        else:
            block_index = slice(0, data.shape[1])

-        return cls(data), block_index
+        block = cls(data)
+        return BlockView(block=block, block_index=block_index)

    @classmethod
    def _consolidate(
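For reference, a small sketch of the new return type on the NumPy side (assuming a plain 2-D array takes the `else` branch shown above; the array is made up):

```python
import numpy as np

from meerkat.block.numpy_block import NumpyBlock

# from_column_data now returns a BlockView instead of a (block, block_index) tuple.
view = NumpyBlock.from_column_data(np.arange(12).reshape(3, 4))
print(type(view.block))   # NumpyBlock
print(view.block_index)   # slice(0, 4, None): the column spans all 4 block columns
print(view.data.shape)    # (3, 4)
```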
10 changes: 5 additions & 5 deletions meerkat/block/pandas_block.py
@@ -2,7 +2,7 @@

import os
from dataclasses import dataclass
-from typing import Hashable, Mapping, Sequence, Tuple, Union
+from typing import Hashable, Sequence, Tuple, Union

import pandas as pd
import torch
@@ -11,7 +11,7 @@
from meerkat.columns.numpy_column import NumpyArrayColumn
from meerkat.columns.tensor_column import TensorColumn

-from .abstract import AbstractBlock, BlockIndex
+from .abstract import AbstractBlock, BlockIndex, BlockView


class PandasBlock(AbstractBlock):
@@ -36,7 +36,7 @@ def _get_data(self, index: BlockIndex) -> pd.Series:
        return self.data[index]

    @classmethod
-    def from_data(cls, data: pd.Series) -> Tuple[PandasBlock, Mapping[str, BlockIndex]]:
+    def from_column_data(cls, data: pd.Series) -> Tuple[PandasBlock, BlockView]:
        """[summary]

        Args:
@@ -50,8 +50,8 @@ def from_column_data(cls, data: pd.Series) -> Tuple[PandasBlock, BlockInde
            Tuple[PandasBlock, Mapping[str, BlockIndex]]: [description]
        """
        data = pd.DataFrame({"col": data})
-        block_index = "col"
-        return cls(data), block_index
+        block = cls(data)
+        return BlockView(block_index="col", block=block)

@classmethod
def _consolidate(
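And the same shape of change on the pandas side, sketched with a made-up series:

```python
import pandas as pd

from meerkat.block.pandas_block import PandasBlock

view = PandasBlock.from_column_data(pd.Series(["x", "y", "z"]))
print(view.block_index)  # "col": the series is stored as the "col" column of a DataFrame
print(view.data)         # the underlying pd.Series, fetched via the BlockView property
```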
11 changes: 5 additions & 6 deletions meerkat/block/tensor_block.py
@@ -2,7 +2,7 @@

import os
from dataclasses import dataclass
-from typing import Hashable, Mapping, Sequence, Tuple, Union
+from typing import Hashable, Sequence, Tuple, Union

import numpy as np
import torch
@@ -11,7 +11,7 @@
from meerkat.columns.numpy_column import NumpyArrayColumn
from meerkat.errors import ConsolidationError

-from .abstract import AbstractBlock, BlockIndex
+from .abstract import AbstractBlock, BlockIndex, BlockView


class TensorBlock(AbstractBlock):
@@ -45,9 +45,7 @@ def _get_data(self, index: BlockIndex) -> torch.Tensor:
        return self.data[:, index]

    @classmethod
-    def from_data(
-        cls, data: torch.Tensor
-    ) -> Tuple[TensorBlock, Mapping[str, BlockIndex]]:
+    def from_column_data(cls, data: torch.Tensor) -> Tuple[TensorBlock, BlockView]:
        """[summary]

        Args:
@@ -68,7 +66,8 @@ def from_data(
        else:
            block_index = slice(0, data.shape[1])

-        return cls(data), block_index
+        block = cls(data)
+        return BlockView(block_index=block_index, block=block)

    @classmethod
    def _consolidate(
2 changes: 1 addition & 1 deletion meerkat/columns/abstract.py
@@ -248,7 +248,7 @@ def _repr_pandas_(self, max_rows: int = None) -> pd.Series:
        if len(self) > max_rows:
            col = pd.Series(
                [self._repr_cell(idx) for idx in range(max_rows // 2)]
-                + [None]
+                + [self._repr_cell(0)]
                + [
                    self._repr_cell(idx)
                    for idx in range(len(self) - max_rows // 2, len(self))
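A hedged illustration of what this touches (assuming `NumpyArrayColumn` can be built directly from a numpy array and that `_repr_pandas_` is the helper behind the truncated display): when a long column is abbreviated, the middle placeholder row is now rendered from a real cell (`self._repr_cell(0)`) instead of `None`.

```python
import numpy as np

from meerkat.columns.numpy_column import NumpyArrayColumn

col = NumpyArrayColumn(np.arange(1_000))
# Truncated repr: head rows, one placeholder row (previously None), tail rows.
print(col._repr_pandas_(max_rows=6))
```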