Add SparseLinear_v2, fixing indexing issues #754

Merged · 11 commits · Nov 17, 2022
2 changes: 1 addition & 1 deletion thinc/api.py
@@ -25,7 +25,7 @@
from .layers import Dropout, Embed, expand_window, HashEmbed, LayerNorm, Linear
from .layers import Maxout, Mish, MultiSoftmax, Relu, softmax_activation, Softmax, LSTM
from .layers import CauchySimilarity, ParametricAttention, Logistic
from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear
from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear, SparseLinear_v2
from .layers import ClippedLinear, ReluK, HardTanh, HardSigmoid
from .layers import Dish, HardSwish, HardSwishMobilenet, Swish, Gelu
from .layers import PyTorchWrapper, PyTorchRNNWrapper, PyTorchLSTM
2 changes: 1 addition & 1 deletion thinc/layers/__init__.py
@@ -26,7 +26,7 @@
from .sigmoid import Sigmoid
from .softmax_activation import softmax_activation
from .softmax import Softmax, Softmax_v2
from .sparselinear import SparseLinear
from .sparselinear import SparseLinear, SparseLinear_v2
from .tensorflowwrapper import TensorFlowWrapper, keras_subclass
from .mxnetwrapper import MXNetWrapper

67 changes: 52 additions & 15 deletions thinc/layers/sparselinear.pyx
@@ -27,6 +27,22 @@ def SparseLinear(nO: Optional[int] = None, length: int = 2 ** 18):
init=init,
params={"W": None, "b": None},
dims={"nO": nO, "length": length},
attrs={"v1_indexing": True},
)


@cython.binding(True)
@registry.layers("SparseLinear.v2")
def SparseLinear_v2(nO: Optional[int] = None, length: int = 2 ** 18):
# NB: We can't have a generic return type annotation if we want the function
# to be bound (and inspectable): https://github.com/cython/cython/issues/2753
return Model(
"sparse_linear",
forward,
init=init,
params={"W": None, "b": None},
dims={"nO": nO, "length": length},
attrs={"v1_indexing": False},
)


@@ -70,11 +86,12 @@ def _begin_cpu_update(model, np.ndarray keys, np.ndarray values, np.ndarray leng
cdef np.ndarray W = model.get_param("W")
cdef np.ndarray b = model.get_param("b")
cdef np.ndarray scores = model.ops.alloc((len(lengths), nO))
cdef bint v1_indexing = model.attrs["v1_indexing"]
scores += b
set_scoresC(<float*>scores.data,
<uint64_t*>keys.data, <float*>values.data, <int32_t*>lengths.data,
lengths.shape[0], nO,
<float*>W.data, length)
<float*>W.data, length, v1_indexing)
return scores, _finish_linear_update(model, keys, values, lengths)


@@ -95,10 +112,10 @@ class _finish_linear_update:
cdef np.ndarray keys = self.keys
cdef np.ndarray values = self.values
cdef np.ndarray lengths = self.lengths
cdef bint v1_indexing = self.model.attrs["v1_indexing"]
set_gradientC(<float*>d_weights.data,
<uint64_t*>keys.data, <float*>values.data, <int32_t*>lengths.data,
lengths.shape[0], nO,
&d_scores[0,0], length)
lengths.shape[0], nO, &d_scores[0,0], length, v1_indexing)
cdef int i, j
for i in range(d_scores.shape[0]):
for j in range(d_scores.shape[1]):
@@ -108,43 +125,63 @@
return (self.keys, self.values, self.lengths)


# v1 indexing is incorrect and only uses a subset of the weight matrix; it is
# kept here for backwards compatibility. See #752 for more information.
cdef void set_scoresC(float* scores,
const uint64_t* keys, const float* values, const int32_t* lengths,
int batch_size, int nr_out,
const float* weights, int nr_weight) nogil:
int batch_size, int nr_out, const float* weights, int nr_weight,
bint v1_indexing) nogil:
cdef uint32_t idx1, idx2
cdef uint32_t hash1, hash2
for length in lengths[:batch_size]:
for i in range(length):
hash1 = MurmurHash3_x86_32_uint64(keys[i], 0)
hash2 = MurmurHash3_x86_32_uint64(keys[i], 1)
idx1 = hash1 & (nr_weight-1)
idx2 = hash2 & (nr_weight-1)
if v1_indexing:
idx1 = hash1 & (nr_weight-1)
idx2 = hash2 & (nr_weight-1)
else:
idx1 = hash1 % nr_weight
idx2 = hash2 % nr_weight
value = values[i]
for clas in range(nr_out):
scores[clas] += weights[idx1 + clas] * value
scores[clas] += weights[idx2 + clas] * value
if v1_indexing:
scores[clas] += weights[idx1 + clas] * value
scores[clas] += weights[idx2 + clas] * value
else:
scores[clas] += weights[(clas * nr_weight) + idx1] * value
scores[clas] += weights[(clas * nr_weight) + idx2] * value
scores += nr_out
keys += length
values += length


# v1 indexing is incorrect and only uses a subset of the weight matrix; it is
# kept here for backwards compatibility. See #752 for more information.
cdef void set_gradientC(float* d_weights,
const uint64_t* keys, const float* values, const int32_t* lengths,
int batch_size, int nr_out,
const float* d_scores, int nr_weight) nogil:
int batch_size, int nr_out, const float* d_scores, int nr_weight,
bint v1_indexing) nogil:
cdef uint32_t idx1, idx2
cdef uint32_t hash1, hash2
for length in lengths[:batch_size]:
for i in range(length):
hash1 = MurmurHash3_x86_32_uint64(keys[i], 0)
hash2 = MurmurHash3_x86_32_uint64(keys[i], 1)
idx1 = hash1 & (nr_weight-1)
idx2 = hash2 & (nr_weight-1)
if v1_indexing:
idx1 = hash1 & (nr_weight-1)
idx2 = hash2 & (nr_weight-1)
else:
idx1 = hash1 % nr_weight
idx2 = hash2 % nr_weight
value = values[i]
for clas in range(nr_out):
d_weights[idx1 + clas] += d_scores[clas] * value
d_weights[idx2 + clas] += d_scores[clas] * value
if v1_indexing:
d_weights[idx1 + clas] += d_scores[clas] * value
d_weights[idx2 + clas] += d_scores[clas] * value
else:
d_weights[(clas * nr_weight) + idx1] += d_scores[clas] * value
d_weights[(clas * nr_weight) + idx2] += d_scores[clas] * value
d_scores += nr_out
keys += length
values += length
1 change: 1 addition & 0 deletions thinc/tests/layers/test_layers_api.py
@@ -128,6 +128,7 @@ def assert_data_match(Y, out_data):
# ("CauchySimilarity.v1", {}, (array2d, array2d), array1d),
("ParametricAttention.v1", {}, ragged, ragged),
("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d),
("SparseLinear.v2", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d),
("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint),
("remap_ids.v2", {"mapping_table": {}, "column": 1}, numpy.array([[1, 2, 3], [4, 5, 6]]).T, array2dint)
# fmt: on
36 changes: 36 additions & 0 deletions website/docs/api-layers.md
@@ -802,6 +802,42 @@ length, describing the concatenated batch of input features and their values.
The `lengths` array should have one entry per sequence in the batch, and the sum
of the lengths should equal the length of the keys and values array.

<infobox variant="warning">

`SparseLinear` should not be used for new models because it contains an indexing
bug. As a result, only a subset of the weights is used. Use
[`SparseLinear_v2`](#sparselinear_v2) instead.

</infobox>
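
The difference between the two versions is how a hashed feature key is mapped to
an offset into the flattened weight vector. Below is an illustrative sketch (not
the library's code) of the two schemes, mirroring the index computation in
`set_scoresC` in this PR and assuming a weight vector of `nO * length` floats as
allocated by the layer:

```python
import numpy as np

nO, length = 4, 2 ** 18                # number of classes and hash-table length
W = np.zeros(nO * length, dtype="f")   # flattened weight matrix, as in the layer
h = 123456789                          # a 32-bit hash of some feature key

# v1 (SparseLinear.v1): the hash is masked into [0, length) and the class
# offset is added directly, so only W[0 : length + nO) can ever be touched.
idx_v1 = h & (length - 1)
w_idx_v1 = [idx_v1 + clas for clas in range(nO)]

# v2 (SparseLinear.v2): the hash is reduced modulo length and each class has
# its own stride of the weight vector, so the whole matrix is reachable.
idx_v2 = h % length
w_idx_v2 = [clas * length + idx_v2 for clas in range(nO)]
```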

| Argument | Type | Description |
| ----------- | --------------------------------------------------------- | -------------------------------------------------------- |
| `nO` | <tt>Optional[int]</tt> | The size of the output vectors. |
| `length` | <tt>int</tt> | The size of the weights vector, to be tuned empirically. |
| **RETURNS** | <tt>Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd]</tt> | The created layer. |

```python
https://github.com/explosion/thinc/blob/master/thinc/layers/sparselinear.pyx
```

### SparseLinear_v2 {#sparselinear_v2 tag="function"}

<inline-list>

- **Input:** <ndarray>Tuple[ArrayXd, ArrayXd, ArrayXd]</ndarray>
- **Output:** <ndarray>ArrayXd</ndarray>
- **Parameters:** <ndarray shape="nO*length,">W</ndarray>,
<ndarray shape="nO,">b</ndarray>, `length` <tt>int</tt>

</inline-list>

A sparse linear layer using the "hashing trick". Useful for tasks such as text
classification. Inputs to the layer should be a tuple of arrays
`(keys, values, lengths)`, where the `keys` and `values` are arrays of the same
length, describing the concatenated batch of input features and their values.
The `lengths` array should have one entry per sequence in the batch, and the sum
of the lengths should equal the length of the keys and values array.
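
As a minimal usage sketch (the feature keys, values, and sizes below are made-up
example data, and `nO` is given up front so the layer can be initialized without
sample input):

```python
import numpy
from thinc.api import SparseLinear_v2

model = SparseLinear_v2(nO=4, length=2 ** 16)
model.initialize()

# Two sequences concatenated into flat arrays: the first sequence has two
# features, the second has one, so the lengths sum to len(keys).
keys = numpy.asarray([10, 20, 30], dtype="uint64")
values = numpy.asarray([1.0, 1.0, 1.0], dtype="float32")
lengths = numpy.asarray([2, 1], dtype="int32")

scores = model.predict((keys, values, lengths))  # shape (2, 4): one row per sequence
```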

| Argument | Type | Description |
| ----------- | --------------------------------------------------------- | -------------------------------------------------------- |
| `nO` | <tt>Optional[int]</tt> | The size of the output vectors. |