Merge remote-tracking branch 'origin/main' into fix_pad_sequence
vmoens committed Feb 25, 2024
2 parents 60323b8 + dbb3363 commit f47ac14
Showing 27 changed files with 2,110 additions and 391 deletions.
2 changes: 1 addition & 1 deletion .github/scripts/m1_script.sh
@@ -1,3 +1,3 @@
#!/bin/bash

export BUILD_VERSION=0.3.0
export BUILD_VERSION=0.4.0
24 changes: 15 additions & 9 deletions .github/workflows/build-wheels-m1.yml
@@ -13,6 +13,10 @@ on:
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
workflow_dispatch:

permissions:
id-token: write
contents: read

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
@@ -23,21 +27,23 @@ jobs:
test-infra-ref: main
build:
needs: generate-matrix
strategy:
fail-fast: false
matrix:
include:
- repository: pytorch/tensordict
smoke-test-script: test/smoke_test.py
package-name: tensordict
name: pytorch/tensordict
uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main
with:
repository: pytorch/tensordict
repository: ${{ matrix.repository }}
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
# pre-script: .github/scripts/pre_build_script_m1.sh
post-script: ""
package-name: tensordict
runner-type: macos-m1-12
smoke-test-script: ""
package-name: ${{ matrix.package-name }}
runner-type: macos-m1-stable
smoke-test-script: ${{ matrix.smoke-test-script }}
trigger-event: ${{ github.event_name }}
env-var-script: .github/scripts/m1_script.sh
secrets:
AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/test-macos.yml
@@ -54,7 +54,7 @@ jobs:
fail-fast: false
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-12
runner: macos-m1-stable
repository: pytorch/tensordict
timeout: 120
script: |
8 changes: 4 additions & 4 deletions .github/workflows/wheels.yml
@@ -4,7 +4,7 @@ on:
types: [opened, synchronize, reopened]
push:
branches:
- release/0.3.0
- release/0.4.0

concurrency:
# Documentation suggests ${{ github.head_ref }}, but that's only available on pull_request/pull_request_target triggers, so using ${{ github.ref }}.
@@ -32,7 +32,7 @@ jobs:
run: |
export PATH="/opt/python/${{ matrix.python_version[1] }}/bin:$PATH"
python3 -mpip install wheel
BUILD_VERSION=0.3.0 python3 setup.py bdist_wheel
BUILD_VERSION=0.4.0 python3 setup.py bdist_wheel
# NB: wheels have the linux_x86_64 tag so we rename to manylinux1
# find . -name 'dist/*whl' -exec bash -c ' mv $0 ${0/linux/manylinux1}' {} \;
# pytorch/pytorch binaries are also manylinux_2_17 compliant but they
@@ -72,7 +72,7 @@ jobs:
run: |
export CC=clang CXX=clang++
python3 -mpip install wheel
BUILD_VERSION=0.3.0 python3 setup.py bdist_wheel
BUILD_VERSION=0.4.0 python3 setup.py bdist_wheel
- name: Upload wheel for the test-wheel job
uses: actions/upload-artifact@v2
with:
@@ -104,7 +104,7 @@ jobs:
shell: bash
run: |
python3 -mpip install wheel
BUILD_VERSION=0.3.0 python3 setup.py bdist_wheel
BUILD_VERSION=0.4.0 python3 setup.py bdist_wheel
- name: Upload wheel for the test-wheel job
uses: actions/upload-artifact@v2
with:
153 changes: 122 additions & 31 deletions README.md
@@ -30,7 +30,8 @@

[**Installation**](#installation) | [**General features**](#general) |
[**Tensor-like features**](#tensor-like-features) | [**Distributed capabilities**](#distributed-capabilities) |
[**TensorDict for functional programming using FuncTorch**](#tensordict-for-functional-programming-using-functorch) |
[**TensorDict for functional programming**](#tensordict-for-functional-programming) |
[**TensorDict for parameter serialization**](#tensordict-for-parameter-serialization) |
[**Lazy preallocation**](#lazy-preallocation) | [**Nesting TensorDicts**](#nesting-tensordicts) | [**TensorClass**](#tensorclass)

`TensorDict` is a dictionary-like class that inherits properties from tensors,
@@ -39,10 +40,10 @@ in distributed settings.

The main purpose of TensorDict is to make code-bases more _readable_ and _modular_ by abstracting away tailored operations:
```python
for i, tensordict in enumerate(dataset):
for i, data in enumerate(dataset):
# the model reads and writes tensordicts
tensordict = model(tensordict)
loss = loss_module(tensordict)
data = model(data)
loss = loss_module(data)
loss.backward()
optimizer.step()
optimizer.zero_grad()
@@ -60,7 +61,7 @@ A tensordict is primarily defined by its `batch_size` (or `shape`) and its key-value pairs:
```python
>>> from tensordict import TensorDict
>>> import torch
>>> tensordict = TensorDict({
>>> data = TensorDict({
... "key 1": torch.ones(3, 4, 5),
... "key 2": torch.zeros(3, 4, 5, dtype=torch.bool),
... }, batch_size=[3, 4])
@@ -69,24 +70,44 @@ The `batch_size` and the first dimensions of each of the tensors must be compliant.
The tensors can be of any dtype and device. Optionally, one can restrict a tensordict to
a dedicated device, in which case every tensor written to it is sent to that device:
```python
>>> tensordict = TensorDict({
>>> data = TensorDict({
... "key 1": torch.ones(3, 4, 5),
... "key 2": torch.zeros(3, 4, 5, dtype=torch.bool),
... }, batch_size=[3, 4], device="cuda:0")
>>> tensordict["key 3"] = torch.randn(3, 4, device="cpu")
>>> assert tensordict["key 3"].device is torch.device("cuda:0")
>>> data["key 3"] = torch.randn(3, 4, device="cpu")
>>> assert data["key 3"].device is torch.device("cuda:0")
```
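
To illustrate what "compliant" means here, a minimal sketch (the key names below are only illustrative): the leading dimensions of every entry must match the tensordict's batch size, while any trailing dimensions are free:
```python
>>> data = TensorDict({"a": torch.zeros(3, 4, 5)}, batch_size=[3, 4])
>>> data["b"] = torch.zeros(3, 4)        # OK: leading dims match the batch size
>>> data["c"] = torch.zeros(3, 4, 2, 7)  # OK: any number of trailing dims is allowed
>>> # data["d"] = torch.zeros(2, 4)      # would be rejected: leading dims must match
```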

But that is not all, you can also store nested values in a tensordict:
```python
>>> data["nested", "key"] = torch.zeros(3, 4) # the batch-size must match
```
and any nested tuple structure will be unravelled to make it easy to read code and
write ops programmatically:
```python
>>> data["nested", ("supernested", ("key",))] = torch.zeros(3, 4) # the batch-size must match
>>> assert (data["nested", "supernested", "key"] == 0).all()
>>> assert (("nested",), "supernested", (("key",),)) in data.keys(include_nested=True) # this works too!
```

You can also store non-tensor data in tensordicts:

```python
>>> data = TensorDict({"a-tensor": torch.randn(1, 2)}, batch_size=[1, 2])
>>> data["non-tensor"] = "a string!"
>>> assert data["non-tensor"] == "a string!"
```

### Tensor-like features

TensorDict objects can be indexed exactly like tensors. The result of indexing
a TensorDict is another TensorDict containing tensors indexed along the required dimension:
```python
>>> tensordict = TensorDict({
>>> data = TensorDict({
... "key 1": torch.ones(3, 4, 5),
... "key 2": torch.zeros(3, 4, 5, dtype=torch.bool),
... }, batch_size=[3, 4])
>>> sub_tensordict = tensordict[..., :2]
>>> sub_tensordict = data[..., :2]
>>> assert sub_tensordict.shape == torch.Size([3, 2])
>>> assert sub_tensordict["key 1"].shape == torch.Size([3, 2, 5])
```
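
Integer and boolean-mask indices work the same way; a small sketch, assuming the same `data` as above:
```python
>>> assert data[0].shape == torch.Size([4])        # an integer index drops the first batch dimension
>>> mask = torch.tensor([True, False, True])
>>> assert data[mask].shape == torch.Size([2, 4])  # a boolean mask selects along the first batch dimension
```
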
@@ -107,15 +128,15 @@ Similarly, one can build tensordicts by stacking or concatenating single tensordicts:
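
As a minimal sketch of that (key names are illustrative): stacking adds a leading batch dimension, while concatenating extends an existing one:
```python
>>> td1 = TensorDict({"a": torch.zeros(3, 4)}, batch_size=[3, 4])
>>> td2 = TensorDict({"a": torch.ones(3, 4)}, batch_size=[3, 4])
>>> assert torch.stack([td1, td2], 0).batch_size == torch.Size([2, 3, 4])
>>> assert torch.cat([td1, td2], 0).batch_size == torch.Size([6, 4])
```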

TensorDict instances can also be reshaped, viewed, squeezed and unsqueezed:
```python
>>> tensordict = TensorDict({
>>> data = TensorDict({
... "key 1": torch.ones(3, 4, 5),
... "key 2": torch.zeros(3, 4, 5, dtype=torch.bool),
... }, batch_size=[3, 4])
>>> print(tensordict.view(-1))
>>> print(data.view(-1))
torch.Size([12])
>>> print(tensordict.reshape(-1))
>>> print(data.reshape(-1))
torch.Size([12])
>>> print(tensordict.unsqueeze(-1))
>>> print(data.unsqueeze(-1))
torch.Size([3, 4, 1])
```

@@ -124,8 +145,16 @@ clone them, update them in-place or not, split them, unbind them, expand them etc.

If a functionality is missing, it is easy to call it using `apply()` or `apply_()`:
```python
tensordict_uniform = tensordict.apply(lambda tensor: tensor.uniform_())
tensordict_uniform = data.apply(lambda tensor: tensor.uniform_())
```

``apply()`` is also a convenient way to filter a tensordict, for instance:
```python
data = TensorDict({"a": torch.tensor(1.0, dtype=torch.float), "b": torch.tensor(1, dtype=torch.int64)}, [])
data_float = data.apply(lambda x: x if x.dtype == torch.float else None) # contains only the "a" key
assert "b" not in data_float
```

### Distributed capabilities

Complex data structures can be cumbersome to synchronize in distributed settings.
@@ -146,39 +175,101 @@ When nodes share a common scratch space, the
`MemoryMappedTensor` class can be used
to seamlessly send, receive and read a huge amount of data.
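
A minimal sketch of that pattern (the path and tensor shapes below are made up for illustration): one node materializes a memory-mapped tensordict on the shared scratch space, and any other node can map the same files and read only the slices it needs:
```python
>>> data = TensorDict({
...     "obs": torch.zeros(1000, 84, 84, dtype=torch.uint8),
...     "reward": torch.zeros(1000, 1),
... }, batch_size=[1000])
>>> data.memmap_("/shared/scratch/rollout")  # the writer rank materializes the files once
>>> remote = TensorDict.load_memmap("/shared/scratch/rollout")  # other ranks map the same files lazily
>>> batch = remote[torch.randint(1000, (32,))]  # only the requested rows are read from disk
```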

### TensorDict for functional programming using FuncTorch
### TensorDict for functional programming

We also provide an API to use TensorDict in conjunction with [FuncTorch](https://pytorch.org/functorch).
For instance, TensorDict makes it easy to concatenate model weights to do model ensembling:
```python
>>> from torch import nn
>>> from tensordict import TensorDict
>>> from tensordict.nn import make_functional
>>> import torch
>>> from torch import vmap
>>> layer1 = nn.Linear(3, 4)
>>> layer2 = nn.Linear(4, 4)
>>> model = nn.Sequential(layer1, layer2)
>>> params = TensorDict.from_module(model)
>>> # we represent the weights hierarchically
>>> weights1 = TensorDict(layer1.state_dict(), []).unflatten_keys(".")
>>> weights2 = TensorDict(layer2.state_dict(), []).unflatten_keys(".")
>>> params = make_functional(model)
>>> assert (params == TensorDict({"0": weights1, "1": weights2}, [])).all()
>>> # Let's use our functional module
>>> x = torch.randn(10, 3)
>>> out = model(x, params=params) # params is the last arg (or kwarg)
>>> with params.to_module(model):
...     out = model(x)
>>> # an ensemble of models: we stack params along the first dimension...
>>> params_stack = torch.stack([params, params], 0)
>>> # ... and use it as an input we'd like to pass through the model
>>> y = vmap(model, (None, 0))(x, params_stack)
>>> def func(x, params):
...     with params.to_module(model):
...         return model(x)
>>> y = vmap(func, (None, 0))(x, params_stack)
>>> print(y.shape)
torch.Size([2, 10, 4])
```

Moreover, tensordict modules are compatible with `torch.fx` and `torch.compile`,
Moreover, tensordict modules are compatible with `torch.fx` and (soon) `torch.compile`,
which means that you can get the best of both worlds: a codebase that is
both readable and future-proof as well as efficient and portable!
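
For context, a tensordict module is simply a module that reads its inputs from, and writes its outputs to, a tensordict; a minimal sketch (the key names are illustrative):
```python
>>> from tensordict.nn import TensorDictModule
>>> module = TensorDictModule(nn.Linear(3, 4), in_keys=["x"], out_keys=["y"])
>>> data = TensorDict({"x": torch.randn(10, 3)}, batch_size=[10])
>>> data = module(data)  # reads "x", writes "y"
>>> assert data["y"].shape == torch.Size([10, 4])
```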

### TensorDict for parameter serialization and building datasets

TensorDict offers an API for parameter serialization that can be >3x faster than
regular calls to `torch.save(state_dict)`. Moreover, because tensors will be saved
independently on disk, you can deserialize your checkpoint on an arbitrary slice
of the model.

```python
>>> model = nn.Sequential(nn.Linear(3, 4), nn.Linear(4, 3))
>>> params = TensorDict.from_module(model)
>>> params.memmap("/path/to/saved/folder/", num_threads=16) # adjust num_threads for speed
>>> # load params
>>> params = TensorDict.load_memmap("/path/to/saved/folder/", num_threads=16)
>>> params.to_module(model) # load onto model
>>> params["0"].to_module(model[0]) # load on a slice of the model
>>> # in the latter case we could also have loaded only the slice we needed
>>> params0 = TensorDict.load_memmap("/path/to/saved/folder/0", num_threads=16)
>>> params0.to_module(model[0]) # load on a slice of the model
```

The same functionality can be used to access data in a dataset stored on disk.
Storing a single contiguous tensor on disk, accessed through the `tensordict.MemoryMappedTensor`
primitive, and reading slices of it is not only **much** faster than loading
single files one at a time, but also easier and safer (because there is no pickling
or third-party library involved):

```python
# allocate memory of the dataset on disk
data = TensorDict({
"images": torch.zeros((128, 128, 3), dtype=torch.uint8),
"labels": torch.zeros((), dtype=torch.int)}, batch_size=[])
data = data.expand(1000000)
data = data.memmap_like("/path/to/dataset")
# ==> Fill your dataset here
# Let's get 3 items of our dataset:
data[torch.tensor([1, 10000, 500000])] # This is much faster than loading the 3 images independently
```
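
The `Fill your dataset here` step above is left open; one way to do it, sketched here with dummy values, is to write entries in place through indexed assignment:
```python
# a sketch of the fill step: write a few dummy entries into the memory-mapped dataset
for i in range(3):
    data[i] = TensorDict({
        "images": torch.randint(256, (128, 128, 3), dtype=torch.uint8),
        "labels": torch.tensor(i, dtype=torch.int)}, batch_size=[])
```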

### Preprocessing with TensorDict.map

Preprocessing huge contiguous (or not!) datasets can be done via `TensorDict.map`,
which dispatches the work across multiple workers:

```python
import torch
from tensordict import TensorDict, MemoryMappedTensor
import tempfile

def process_data(data):
    images = data.get("images").flip(-2).clone()
    labels = data.get("labels") // 10
    # we update the td in-place
    data.set_("images", images)  # flip the images
    data.set_("labels", labels)  # cluster the labels
    return data  # map() collects the values returned by the workers

if __name__ == "__main__":
    # `data` is the memory-mapped dataset built in the previous snippet
    data_preproc = data.map(process_data, num_workers=4, chunksize=0, pbar=True)  # chunksize=0 processes one item at a time
```

### Lazy preallocation

@@ -187,21 +278,21 @@ items varies according to the script configuration. TensorDict solves this in an
Assume you are working with a function `foo() -> TensorDict`, e.g.
```python
def foo():
    tensordict = TensorDict({}, batch_size=[])
    tensordict["a"] = torch.randn(3)
    tensordict["b"] = TensorDict({"c": torch.zeros(2)}, batch_size=[])
    return tensordict
    data = TensorDict({}, batch_size=[])
    data["a"] = torch.randn(3)
    data["b"] = TensorDict({"c": torch.zeros(2)}, batch_size=[])
    return data
```
and you would like to call this function repeatedly. You could do this in two ways.
The first would simply be to stack the calls to the function:
```python
tensordict = torch.stack([foo() for _ in range(N)])
data = torch.stack([foo() for _ in range(N)])
```
However, you could also choose to preallocate the tensordict:
```python
tensordict = TensorDict({}, batch_size=[N])
data = TensorDict({}, batch_size=[N])
for i in range(N):
    tensordict[i] = foo()
    data[i] = foo()
```
which also results in a tensordict (when `N = 10`)
```
@@ -233,16 +324,16 @@ batch size.
We can switch easily between hierarchical and flat representations.
For instance, the following code will result in a single-level tensordict with keys `"key 1"` and `"key 2.sub-key"`:
```python
>>> tensordict = TensorDict({
>>> data = TensorDict({
... "key 1": torch.ones(3, 4, 5),
... "key 2": TensorDict({"sub-key": torch.randn(3, 4, 5, 6)}, batch_size=[3, 4, 5])
... }, batch_size=[3, 4])
>>> tensordict_flatten = tensordict.flatten_keys(separator=".")
>>> tensordict_flatten = data.flatten_keys(separator=".")
```

Accessing nested tensordicts can be achieved with a single index:
```python
>>> sub_value = tensordict["key 2", "sub-key"]
>>> sub_value = data["key 2", "sub-key"]
```

## TensorClass
13 changes: 13 additions & 0 deletions benchmarks/nn/functional_benchmarks_test.py
@@ -319,6 +319,19 @@ def fun(x, params):
benchmark(vfun, x, params)


@pytest.mark.parametrize("tdparams", [True, False])
def test_to_module_speed(benchmark, tdparams):
    module = torch.nn.Transformer()
    params = TensorDict.from_module(module, as_module=tdparams)

    def func(params=params, module=module):
        with params.to_module(module):
            pass
        return

    benchmark(func)


if __name__ == "__main__":
    args, unknown = argparse.ArgumentParser().parse_known_args()
    pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)