diff --git a/.gitignore b/.gitignore
index 6f73669b..179cf0dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,4 @@ Session.vim
 /trash
 /misc
 /mdx
-.mypy_cache
\ No newline at end of file
+.mypy_cache
diff --git a/demucs/__init__.py b/demucs/__init__.py
index ef5cd6f3..e02c0ada 100644
--- a/demucs/__init__.py
+++ b/demucs/__init__.py
@@ -4,4 +4,4 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-__version__ = "4.1.0a1"
+__version__ = "4.1.0a2"
diff --git a/demucs/api.py b/demucs/api.py
index fc254fb2..20079a6b 100644
--- a/demucs/api.py
+++ b/demucs/api.py
@@ -195,7 +195,7 @@ def update_parameter(
             self._jobs = jobs
         if not isinstance(progress, _NotProvided):
             self._progress = progress
-        if not isinstance(callback, _NotProvided) and (callback is None or callable(callback)):
+        if not isinstance(callback, _NotProvided):
             self._callback = callback
         if not isinstance(callback_arg, _NotProvided):
             self._callback_arg = callback_arg
@@ -266,7 +266,7 @@ def separate_tensor(
         wav = convert_audio(wav, sr, self._samplerate, self._audio_channels)
         ref = wav.mean(0)
         wav -= ref.mean()
-        wav /= ref.std()
+        wav /= ref.std() + 1e-8
         out = apply_model(
             self._model,
             wav[None],
@@ -284,9 +284,9 @@ def separate_tensor(
         )
         if out is None:
             raise KeyboardInterrupt
-        out *= ref.std()
+        out *= ref.std() + 1e-8
         out += ref.mean()
-        wav *= ref.std()
+        wav *= ref.std() + 1e-8
         wav += ref.mean()
         return (wav, dict(zip(self._model.sources, out[0])))
 
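Why the epsilon matters: `ref.std()` is exactly zero for digital silence or any constant signal, so the old code divided by zero and filled the output with NaNs. The following minimal sketch (plain PyTorch, not part of the Demucs API) reproduces the normalization round-trip from `separate_tensor` above and shows that the guarded version stays finite on silent input.

```python
import torch

def normalize_roundtrip(wav: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Apply the same normalization as separate_tensor above, then undo it."""
    ref = wav.mean(0)                        # reference (mono mixdown) channel
    mean, std = ref.mean(), ref.std()
    normalized = (wav - mean) / (std + eps)  # eps avoids 0 / 0 on silent input
    return normalized * (std + eps) + mean

silence = torch.zeros(2, 44100)              # 1 second of stereo digital silence
assert torch.isfinite(normalize_roundtrip(silence)).all()
```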
diff --git a/demucs/apply.py b/demucs/apply.py
index 180db7fe..1540f3d4 100644
--- a/demucs/apply.py
+++ b/demucs/apply.py
@@ -150,7 +150,7 @@ def apply_model(model: tp.Union[BagOfModels, Model],
                 num_workers: int = 0, segment: tp.Optional[float] = None,
                 pool=None, lock=None,
                 callback: tp.Optional[tp.Callable[[dict], None]] = None,
-                callback_arg: tp.Optional[dict] = None) -> tp.Optional[th.Tensor]:
+                callback_arg: tp.Optional[dict] = None) -> th.Tensor:
     """
     Apply model to a given mixture.
 
@@ -197,7 +197,7 @@ def apply_model(model: tp.Union[BagOfModels, Model],
         'lock': lock,
     }
     out: tp.Union[float, th.Tensor]
-    res: tp.Union[float, th.Tensor, None]
+    res: tp.Union[float, th.Tensor]
     if isinstance(model, BagOfModels):
         # Special treatment for bag of model.
         # We explicitely apply multiple times `apply_model` so that the random shifts
@@ -205,22 +205,15 @@ def apply_model(model: tp.Union[BagOfModels, Model],
         # for each model are decorrelated.
         estimates: tp.Union[float, th.Tensor] = 0.
         totals = [0.] * len(model.sources)
         callback_arg["models"] = len(model.models)
-        kwargs["callback"] = (
-            (
-                lambda d, i=callback_arg["model_idx_in_bag"]: callback(
-                    _replace_dict(d, ("model_idx_in_bag", i))
-                )
-            )
-            if callable(callback)
-            else None
-        )
         for sub_model, model_weights in zip(model.models, model.weights):
+            kwargs["callback"] = ((
+                lambda d, i=callback_arg["model_idx_in_bag"]: callback(
+                    _replace_dict(d, ("model_idx_in_bag", i))) if callback else None)
+            )
             original_model_device = next(iter(sub_model.parameters())).device
             sub_model.to(device)
             res = apply_model(sub_model, mix, **kwargs, callback_arg=callback_arg)
-            if res is None:
-                return res
             out = res
             sub_model.to(original_model_device)
             for k, inst_weight in enumerate(model_weights):
@@ -252,13 +245,10 @@ def apply_model(model: tp.Union[BagOfModels, Model],
             offset = random.randint(0, max_shift)
             shifted = TensorChunk(padded_mix, offset, length + max_shift - offset)
             kwargs["callback"] = (
-                (lambda d, i=shift_idx: callback(_replace_dict(d, ("shift_idx", i))))
-                if callable(callback)
-                else None
+                (lambda d, i=shift_idx: callback(_replace_dict(d, ("shift_idx", i)))
+                 if callback else None)
             )
             res = apply_model(model, shifted, **kwargs, callback_arg=callback_arg)
-            if res is None:
-                return res
             shifted_out = res
             out += shifted_out[..., max_shift - offset:]
         out /= shifts
@@ -289,17 +279,18 @@ def apply_model(model: tp.Union[BagOfModels, Model],
             chunk = TensorChunk(mix, offset, segment_length)
             future = pool.submit(apply_model, model, chunk, **kwargs, callback_arg=callback_arg,
                                  callback=(lambda d, i=offset:
-                                           callback(_replace_dict(d, ("segment_offset", i))))
-                                 if callable(callback) else None)
+                                           callback(_replace_dict(d, ("segment_offset", i)))
+                                           if callback else None))
             futures.append((future, offset))
             offset += segment_length
         if progress:
             futures = tqdm.tqdm(futures, unit_scale=scale, ncols=120, unit='seconds')
         for future, offset in futures:
-            chunk_out = future.result()  # type: tp.Union[None, th.Tensor]
-            if chunk_out is None:
-                pool.shutdown(wait=False, cancel_futures=True)
-                return chunk_out
+            try:
+                chunk_out = future.result()  # type: th.Tensor
+            except Exception:
+                pool.shutdown(wait=True, cancel_futures=True)
+                raise
             chunk_length = chunk_out.shape[-1]
             out[..., offset:offset + segment_length] += (
                 weight[:chunk_length] * chunk_out).to(mix.device)
@@ -320,20 +311,12 @@ def apply_model(model: tp.Union[BagOfModels, Model],
         assert isinstance(mix, TensorChunk)
         padded_mix = mix.padded(valid_length).to(device)
         with lock:
-            try:
+            if callback is not None:
                 callback(_replace_dict(callback_arg, ("state", "start")))  # type: ignore
-            except KeyboardInterrupt:
-                raise
-            except Exception:
-                pass
         with th.no_grad():
             out = model(padded_mix)
         with lock:
-            try:
+            if callback is not None:
                 callback(_replace_dict(callback_arg, ("state", "end")))  # type: ignore
-            except KeyboardInterrupt:
-                raise
-            except Exception:
-                pass
         assert isinstance(out, th.Tensor)
         return center_trim(out, length)
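Taken together, the `apply.py` changes mean `apply_model` no longer reports an aborted run by returning `None`: if the callback raises, pending futures are cancelled, the pool is shut down, and the exception propagates to the caller. A hedged sketch of the resulting calling pattern follows; it assumes the pretrained `htdemucs` weights can be loaded, and `AbortSeparation` together with `abort_requested` are illustrative names, not part of Demucs.

```python
import torch
from demucs.pretrained import get_model
from demucs.apply import apply_model

class AbortSeparation(Exception):
    """Raised from the callback to stop the separation early."""

abort_requested = False  # e.g. flipped from a GUI "cancel" button

def progress_callback(info: dict) -> None:
    # info carries the keys seen above: "state", "model_idx_in_bag", "segment_offset", ...
    if abort_requested:
        raise AbortSeparation

model = get_model("htdemucs")
mix = torch.zeros(1, model.audio_channels, model.samplerate * 5)  # batch of one 5 s clip
try:
    sources = apply_model(model, mix, device="cpu", callback=progress_callback)
except AbortSeparation:
    print("Separation cancelled from the callback")
```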
diff --git a/demucs/utils.py b/demucs/utils.py
index c80fc129..a3f5993e 100755
--- a/demucs/utils.py
+++ b/demucs/utils.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from collections import defaultdict
+from concurrent.futures import CancelledError
 from contextlib import contextmanager
 import math
 import os
@@ -129,6 +130,8 @@ def __init__(self, func, _dict, *args, **kwargs):
         def result(self):
             if self._dict["run"]:
                 return self.func(*self.args, **self.kwargs)
+            else:
+                raise CancelledError()
 
     def __init__(self, workers=0):
         self._dict = {"run": True}
diff --git a/docs/api.md b/docs/api.md
index e6d9e873..ab55f922 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -79,7 +79,7 @@ progress: If true, show a progress bar.
 
 ##### Notes for callback
 
-The function will be called with only one positional parameter whose type is `dict`. The `callback_arg` will be combined with information of current separation progress. The progress information will override the values in `callback_arg` if same key has been used. To abort the separation, raise `KeyboardInterrupt`.
+The function will be called with only one positional parameter whose type is `dict`. The `callback_arg` will be combined with information of current separation progress. The progress information will override the values in `callback_arg` if same key has been used. To abort the separation, raise an exception inside `callback`; the exception will propagate to your code, so catch it yourself if you want your program to keep running.
 
 Progress information contains several keys (These keys will always exist):
 - `model_idx_in_bag`: The index of the submodel in `BagOfModels`. Starts from 0.
@@ -127,7 +127,7 @@ progress: If true, show a progress bar.
 
 ##### Notes for callback
 
-The function will be called with only one positional parameter whose type is `dict`. The `callback_arg` will be combined with information of current separation progress. The progress information will override the values in `callback_arg` if same key has been used. To abort the separation, raise `KeyboardInterrupt`.
+The function will be called with only one positional parameter whose type is `dict`. The `callback_arg` will be combined with information of current separation progress. The progress information will override the values in `callback_arg` if same key has been used. To abort the separation, raise an exception inside `callback`; the exception will propagate to your code, so catch it yourself if you want your program to keep running.
 
 Progress information contains several keys (These keys will always exist):
 - `model_idx_in_bag`: The index of the submodel in `BagOfModels`. Starts from 0.
diff --git a/docs/release.md b/docs/release.md
index 1c8dd537..0aee2f71 100644
--- a/docs/release.md
+++ b/docs/release.md
@@ -14,7 +14,7 @@ Added `--other-method`: method to get `no_{STEM}`, add up all the other stems (a
 
 Added type `HTDemucs` to type alias `AnyModel`.
 
-## V4.0.1a1, TBD
+## V4.0.1, 8th of September 2023
 
 **From this version, Python 3.7 is no longer supported. This is not a problem since the latest PyTorch 2.0.0 no longer support it either.**
 
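Since `docs/api.md` now tells users to abort by raising from the callback, a small usage sketch may help. It relies only on progress keys that appear in this diff (`state`, `model_idx_in_bag`, `models`, `segment_offset`, `audio_length`); the `Separator` arguments assume the default `htdemucs` model and stereo 44.1 kHz audio.

```python
import torch
from demucs.api import Separator

def on_progress(info: dict) -> None:
    # Report rough progress whenever a chunk finishes ("state" == "end").
    if info.get("state") != "end":
        return
    done = info.get("segment_offset", 0)
    total = max(info.get("audio_length", 1), 1)
    print(f"model {info.get('model_idx_in_bag', 0) + 1}/{info.get('models', 1)}: "
          f"about {100 * done / total:.0f}% of samples processed")

separator = Separator(model="htdemucs", device="cpu", callback=on_progress)
# 5 seconds of stereo silence; thanks to the epsilon added above this no longer produces NaNs.
silence = torch.zeros(2, 44100 * 5)
_, separated = separator.separate_tensor(silence, 44100)
print(list(separated))  # source names, e.g. ['drums', 'bass', 'other', 'vocals']
```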
diff --git a/docs/windows.md b/docs/windows.md
index a84e89bf..acc48c70 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -2,25 +2,32 @@
 
 ## Installation and usage
 
-Parts of the code are untested on Windows (in particular, training a new model). If you don't have much experience with Anaconda, python or the shell, here are more detailed instructions. Note that **Demucs is not supported on 32bits systems** (as Pytorch is not available there).
+If you don't have much experience with Anaconda, python or the shell, here are more detailed instructions. Note that **Demucs is not supported on 32bits systems** (as Pytorch is not available there).
 
 - First install Anaconda with **Python 3.8** or more recent, which you can find [here][install].
 - Start the [Anaconda prompt][prompt]. Then, all commands that follow must be run from this prompt.
 
+<details>
+  <summary>I have no coding experience and these are too difficult for me</summary>
+
+> Then a GUI is suitable for you. See [Demucs GUI](https://github.com/CarlGao4/Demucs-Gui)
+
+</details>
+
 
 ### If you want to use your GPU
 
-If you have graphic cards produced by nVidia with more than 6GiB of memory, you can separate tracks with GPU acceleration. To achieve this, you must install Pytorch with CUDA. If Pytorch was already installed (you already installed Demucs for instance), first run `python.exe -m pip uninstall torch torchaudio`.
-Then visit [Pytorch Home Page](https://pytorch.org/get-started/locally/) and follow the guide on it to install with CUDA support.
+If you have a graphics card produced by NVIDIA with more than 2GiB of memory, you can separate tracks with GPU acceleration. To achieve this, you must install Pytorch with CUDA. If Pytorch was already installed (you already installed Demucs for instance), first run `python.exe -m pip uninstall torch torchaudio`.
+Then visit [Pytorch Home Page](https://pytorch.org/get-started/locally/) and follow the guide on it to install with CUDA support. Please make sure that the installed version of torchaudio is not greater than 2.1 (the latest version at the time of writing; 2.2.0 is known to be unsupported).
 
 ### Installation
 
 Start the Anaconda prompt, and run the following
 
-bash
-```
+
+```cmd
 conda install -c conda-forge ffmpeg
-python.exe -m pip install -U demucs PySoundFile
+python.exe -m pip install -U demucs SoundFile
 ```
 
 ### Upgrade
@@ -33,9 +40,12 @@
 Then to use Demucs, just start the **Anaconda prompt** and run:
 ```
 demucs -d cpu "PATH_TO_AUDIO_FILE_1" ["PATH_TO_AUDIO_FILE_2" ...]
 ```
-The `"` around the filename are required if the path contains spaces.
-The separated files will be under `C:\Users\YOUR_USERNAME\demucs\separated\demucs\`.
+The `"` around the filename are required if the path contains spaces. A simple way to input these paths is to drag a file from a folder into the terminal.
+To find the separated files, you can run this command to open the output folder:
+```
+explorer separated
+```
 
 
 ### Separating an entire folder
@@ -45,7 +55,6 @@ cd FOLDER
 for %i in (*.mp3) do (demucs -d cpu "%i")
 ```
 
-
 ## Potential errors
 
 If you have an error saying that `mkl_intel_thread.dll` cannot be found, you can try to first run
diff --git a/requirements.txt b/requirements.txt
index 26342361..294290d3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ openunmix
 pyyaml
 submitit
 torch>=1.8.1
-torchaudio>=0.8
+torchaudio>=0.8,<2.1
 tqdm
 treetable
 soundfile>=0.10.3;sys_platform=="win32"
diff --git a/requirements_minimal.txt b/requirements_minimal.txt
index 8c6f1e57..1940bf01 100644
--- a/requirements_minimal.txt
+++ b/requirements_minimal.txt
@@ -6,5 +6,5 @@ lameenc>=1.2
 openunmix
 pyyaml
 torch>=1.8.1
-torchaudio>=0.8
+torchaudio>=0.8,<2.1
 tqdm
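Both requirement files now pin `torchaudio` below 2.1, matching the note added to `docs/windows.md`. For environments where Demucs sits next to an existing PyTorch install, a small pre-flight check along these lines can catch an incompatible torchaudio early (a sketch; it assumes the `packaging` package is available):

```python
import torchaudio
from packaging.version import Version

# Mirror the new pin: torchaudio>=0.8,<2.1 (strip local tags such as "+cu118").
installed = Version(torchaudio.__version__.split("+")[0])
if not (Version("0.8") <= installed < Version("2.1")):
    raise RuntimeError(
        f"torchaudio {installed} is outside the supported range >=0.8,<2.1; "
        "reinstall with: pip install 'torchaudio>=0.8,<2.1'"
    )
print(f"torchaudio {installed} looks compatible with this Demucs release")
```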