named_array.py
# coding: utf-8
import itertools

import numpy as np
import pandas as pd
import h5py
from scipy import sparse

from proxy import Proxy


class NamedArrBase(object):
    pass


class NamedArray(Proxy, NamedArrBase):
    # FIXME: issparse(NamedArray(csr_matrix)) -> False
    # FIXME: error when NamedArray(csr_matrix) @ NamedArray(csr_matrix)

    def __init__(self, data, axis_names):
        super().__init__(data)
        self._names = []
        for names in axis_names:
            # TODO: check dims
            self._names.append(pd.Index(names) if names else None)

    def issparse(self):
        return sparse.issparse(object.__getattribute__(self, "_obj"))

    def __getitem__(self, key):
        key, new_names = self._convert_key(key)
        sliced_data = getattr(object.__getattribute__(self, "_obj"), "__getitem__")(key)
        return NamedArray(sliced_data, axis_names=new_names)

    def _convert_key(self, key):
        # TODO: what if key is a single integer?
        # FIXME: error when key includes str x['row1', ]
        # FIXME: error when 1-dim array x['a']
        # FIXME: error when x[['row1', 'row2'], ] -> no colnames
        new_key = []
        new_names = []
        if not isinstance(key, tuple):
            key = (key,)
        for i, idx in enumerate(key):
            if idx is None:
                new_key.append(None)
                new_names.append(None)
            elif isinstance(idx, int):
                new_key.append(idx)
                if self._names[i] is None:
                    new_names.append(None)
                else:
                    new_names.append([self._names[i][idx]])
            elif isinstance(idx, slice):
                new_key.append(idx)
                if self._names[i] is None:
                    new_names.append(None)
                else:
                    new_names.append(self._names[i][idx].tolist())
            elif isinstance(idx, (np.ndarray, list)):
                # Map name labels to integer positions; integers pass through unchanged.
                convert = lambda x: x if isinstance(x, int) else self._names[i].get_loc(x)
                if isinstance(idx, list):
                    new_key.append([convert(x) for x in idx])
                else:
                    for elem in np.nditer(idx, op_flags=['readwrite']):
                        # elem.item() replaces np.asscalar, which was removed in newer NumPy
                        elem[...] = convert(elem.item())
                    new_key.append(idx.astype('int32'))
                if self._names[i] is None:
                    new_names.append(None)
                else:
                    new_names.append(self._names[i][np.ravel(new_key[-1])].tolist())
            else:
                raise ValueError("unsupported index type: %r" % type(idx))  # FIXME
        return tuple(new_key), new_names

    @property
    def names(self):
        return tuple(names.tolist() if names is not None else None
                     for names in self._names)  # FIXME: slow?

    @names.setter
    def names(self, axes=0):
        # TODO: check length and update names
        raise NotImplementedError()

    def to_hdf(self, path, group=None, mode='w'):
        with h5py.File(path, mode=mode) as h5_file:
            if group is None:
                h5_obj = h5_file
            else:
                h5_obj = h5_file.create_group(group)
            # Axis names: write an empty placeholder dataset when an axis has no names.
            for i, names in enumerate(self._names):
                if names is None:
                    h5_obj.create_dataset("name_%d" % i, dtype='|S2')
                else:
                    h5_obj.create_dataset("name_%d" % i,
                                          data=[x.encode('utf-8') for x in names])
            if isinstance(self._obj, np.ndarray):
                h5_obj.attrs['type'] = 'ndarray'.encode('utf-8')
                h5_obj.create_dataset("arr", data=self._obj)
            elif isinstance(self._obj, (sparse.csr_matrix, sparse.csc_matrix)):
                # Store the CSR/CSC components separately so the matrix can be rebuilt.
                h5_obj.attrs['type'] = type(self._obj).__name__.encode('utf-8')
                h5_obj.attrs['shape'] = self._obj.shape
                h5_obj.create_dataset('data', data=self._obj.data)
                h5_obj.create_dataset('indptr', data=self._obj.indptr)
                h5_obj.create_dataset('indices', data=self._obj.indices)

    @classmethod
    def load(cls, path, group=None):
        with h5py.File(path, mode='r') as h5_file:
            if group is None:
                h5_obj = h5_file
            else:
                h5_obj = h5_file[group]
            data_type = h5_obj.attrs['type'].decode('utf-8')
            arr = None
            if data_type == 'ndarray':
                # Read into memory; the h5py dataset becomes invalid once the file closes.
                arr = h5_obj['arr'][()]
            elif data_type == 'csr_matrix' or data_type == 'csc_matrix':
                shape = h5_obj.attrs['shape']
                data = h5_obj['data'][()]
                indptr = h5_obj['indptr'][()]
                indices = h5_obj['indices'][()]
                if data_type == 'csr_matrix':
                    arr = sparse.csr_matrix((data, indices, indptr), shape=shape)
                elif data_type == 'csc_matrix':
                    arr = sparse.csc_matrix((data, indices, indptr), shape=shape)
            names = []
            for i in range(len(arr.shape)):
                # An empty placeholder dataset (null dataspace, shape is None) marks a nameless axis.
                if h5_obj['name_%d' % i].shape is None:
                    names.append(None)
                else:
                    names.append([x.decode('utf-8') for x in h5_obj['name_%d' % i]])
            return NamedArray(arr, axis_names=names)

    _overrided_special_names = ["__getitem__"]

    @classmethod
    def _create_class_proxy(cls, theclass):
        """creates a proxy for the given class"""
        def make_method(name):
            def method(self, *args, **kw):
                return getattr(object.__getattribute__(self, "_obj"), name)(*args, **kw)
            return method

        namespace = {}
        for name in cls._special_names:
            if name in cls._overrided_special_names:
                continue
            if hasattr(theclass, name):
                namespace[name] = make_method(name)
        return type("%s(%s)" % (cls.__name__, theclass.__name__), (cls,), namespace)
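

# Indexing sketch (illustrative comment only, not executed by this module):
# assuming a 2-D dense array with string names on both axes, labels and
# integer positions can be mixed per axis, and slicing returns another
# NamedArray that carries the corresponding names.
#
#   x = NamedArray(np.arange(6).reshape(2, 3),
#                  axis_names=[['r0', 'r1'], ['c0', 'c1', 'c2']])
#   x[['r0'], :]       # select a row by name
#   x[:, [0, 'c2']]    # mix integer and name indices on one axis
#   x.names            # (['r0', 'r1'], ['c0', 'c1', 'c2'])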


def vstack(blocks, **kwargs):
    # TODO: check that all the arrays are NamedArray instances
    sparse_stack = any(arr.issparse() for arr in blocks)
    if sparse_stack:
        stacked_arr = sparse.vstack([x._obj for x in blocks], **kwargs)
    else:
        stacked_arr = np.vstack([x._obj for x in blocks])
    # Row names are concatenated across blocks; column names come from the first block.
    new_names = []
    new_names.append([*itertools.chain.from_iterable(arr.names[0] for arr in blocks)])
    new_names.append(blocks[0].names[1])
    return NamedArray(stacked_arr, axis_names=new_names)
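

# vstack sketch (illustrative comment only): two blocks that share column
# names; row names are concatenated, column names are taken from the first block.
#
#   top = NamedArray(np.zeros((1, 2)), axis_names=[['r0'], ['a', 'b']])
#   bottom = NamedArray(np.ones((2, 2)), axis_names=[['r1', 'r2'], ['a', 'b']])
#   vstack([top, bottom]).names[0]    # ['r0', 'r1', 'r2']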


def remove_name(x):
    # Unwrap a NamedArray back to the underlying array; pass other objects through.
    if isinstance(x, NamedArrBase):
        return x._obj
    else:
        return x
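

if __name__ == "__main__":
    # Minimal round-trip sketch, assuming the local `proxy` module provides the
    # attribute behaviour this class already relies on, and that writing
    # "named_array_demo.h5" to the working directory is acceptable.
    x = NamedArray(np.arange(6).reshape(2, 3), axis_names=[['r0', 'r1'], None])
    x.to_hdf("named_array_demo.h5")
    y = NamedArray.load("named_array_demo.h5")
    print(y.names)         # expected: (['r0', 'r1'], None)
    print(remove_name(y))  # the underlying ndarray, without axis names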