Merge pull request #399 from claritychallenge/371-CAD2
Adding Cadenza 2
groadabike authored Jul 12, 2024
2 parents 86ab22a + 61e155c commit bebd5fc
Showing 47 changed files with 5,362 additions and 73 deletions.
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
@@ -70,6 +70,9 @@ repos:
- id: mypy
args:
- --explicit-package-bases
additional_dependencies:
- 'types-PyYAML'


- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
7 changes: 7 additions & 0 deletions clarity/enhancer/multiband_compressor/__init__.py
@@ -0,0 +1,7 @@
from clarity.enhancer.multiband_compressor.compressor_qmul import Compressor
from clarity.enhancer.multiband_compressor.crossover import Crossover
from clarity.enhancer.multiband_compressor.multiband_compressor import (
MultibandCompressor,
)

__all__ = ["MultibandCompressor", "Compressor", "Crossover"]
60 changes: 25 additions & 35 deletions clarity/enhancer/multiband_compressor/compressor_qmul.py
@@ -2,8 +2,6 @@

from __future__ import annotations

import warnings

import numpy as np


@@ -68,39 +66,28 @@ def __init__(
knee_width: float = 0.0,
sample_rate: float = 44100.0,
) -> None:
if threshold > 0 or threshold < -60:
warnings.warn(
"Threshold outside the recommended range [0.0, -60.0] dB."
f" {threshold} dB was provided.",
stacklevel=1,
)
if ratio < 1 or ratio > 20:
warnings.warn(
"Ratio outside the recommended range [1.0, 20.0]."
f" {ratio} was provided.",
stacklevel=1,
)

if attack < 0.1 or attack > 80.0:
warnings.warn(
"Attack outside the recommended range [0.1, 80.0] ms."
f" {attack} ms was provided.",
stacklevel=1,
)

if release < 0.1 or release > 1000.0:
warnings.warn(
"Release outside the recommended range [0.1, 1000.0] ms."
f" {release} ms was provided.",
stacklevel=1,
)

if makeup_gain < 0 or makeup_gain > 24:
warnings.warn(
"Make-up gain outside the recommended range [0.0, 24.0] dB."
f" {makeup_gain} dB was provided.",
stacklevel=1,
)
"""Constructor for the Compressor class.
Args:
threshold (float): The threshold level in dB.
ratio (float): The compression ratio.
attack (float): The attack time in ms.
release (float): The release time in ms.
makeup_gain (float): The make-up gain in dB.
knee_width (float): The knee width in dB.
sample_rate (float): The sample rate in Hz.
Notes:
The original implementation recommends a range for each parameter.
These ranges are not enforced in this implementation.
The recommended ranges are:
- threshold in the range [0.0, -60.0] dB,
- ratio in the range [1.0, 20.0],
- attack in the range [0.1, 80.0] ms,
- release in the range [0.1, 1000.0] ms,
- makeup_gain in the range [0.0, 24.0] dB,
- knee_width in the range [0.0, 10.0] dB.
"""

self.threshold = float(threshold)
self.ratio = float(ratio)
@@ -110,6 +97,8 @@ def __init__(
self.sample_rate = float(sample_rate)
self.knee_width = float(knee_width)

self.eps = 1e-12

self.alpha_attack = np.exp(-1.0 / (0.001 * self.sample_rate * self.attack))
self.alpha_release = np.exp(-1.0 / (0.001 * self.sample_rate * self.release))

@@ -125,6 +114,7 @@ def __call__(self, input_signal: np.ndarray) -> np.ndarray:
"""

# Compute the instantaneous desired levels
input_signal[input_signal == 0] = self.eps
x_g = 20 * np.log10(np.abs(input_signal))
x_g[x_g < -120] = -120
y_g = self.threshold + (x_g - self.threshold) / self.ratio
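The removed range warnings are now documented in the docstring instead, and `__call__` replaces exact zeros with `eps` before the `20 * log10(...)` level computation so the gain computer never sees `-inf`. A minimal usage sketch, assuming the exports from `clarity/enhancer/multiband_compressor/__init__.py` above; the parameter values are illustrative choices inside the recommended ranges, not challenge settings:

```python
# Illustrative sketch only: the parameter values are assumptions chosen from
# the recommended ranges in the docstring, not values used by the challenge.
import numpy as np

from clarity.enhancer.multiband_compressor import Compressor

compressor = Compressor(
    threshold=-20.0,  # dB, recommended range [0.0, -60.0]
    ratio=4.0,        # recommended range [1.0, 20.0]
    attack=5.0,       # ms, recommended range [0.1, 80.0]
    release=100.0,    # ms, recommended range [0.1, 1000.0]
    makeup_gain=0.0,  # dB, recommended range [0.0, 24.0]
    knee_width=2.0,   # dB, recommended range [0.0, 10.0]
    sample_rate=44100.0,
)

rng = np.random.default_rng(0)
signal = 0.1 * rng.standard_normal(44100)  # one second of noise at 44.1 kHz
compressed = compressor(signal)            # apply the sample-wise gain computer
```
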
68 changes: 66 additions & 2 deletions clarity/utils/flac_encoder.py
@@ -15,6 +15,8 @@
import pyflac as pf
import soundfile as sf

from clarity.utils.signal_processing import clip_signal, resample, to_16bit

logger = logging.getLogger(__name__)


@@ -189,11 +191,11 @@ def encode(
"""
if signal.dtype != np.int16:
logger.error(
f"FLAC encoder only supports 16-bit integer signals, "
"FLAC encoder only supports 16-bit integer signals, "
f"but got {signal.dtype}"
)
raise ValueError(
f"FLAC encoder only supports 16-bit integer signals, "
"FLAC encoder only supports 16-bit integer signals, "
f"but got {signal.dtype}"
)

@@ -261,3 +263,65 @@ def read_flac_signal(filename: Path) -> tuple[np.ndarray, float]:
# Scale signal
signal *= max_value
return signal, sample_rate


def save_flac_signal(
signal: np.ndarray,
filename: Path,
signal_sample_rate: int,
output_sample_rate: int | None = None,
do_clip_signal: bool = False,
do_soft_clip: bool = False,
do_scale_signal: bool = False,
) -> None:
"""
Function to save output signals.
- The output signal will be resampled to ``output_sample_rate``.
If ``output_sample_rate`` is None, the output signal will keep
the sample rate of the input signal.
- The output signal will be clipped to [-1, 1] if ``do_clip_signal`` is True,
using soft clipping if ``do_soft_clip`` is also True. If
``do_clip_signal`` is False, ``do_soft_clip`` is ignored.
- The output signal will be scaled to [-1, 1] if ``do_scale_signal`` is True.
If the signal is scaled, the scale factor is saved in a TXT file
alongside the FLAC file. Scaling takes precedence: if ``do_scale_signal``
is True, ``do_clip_signal`` is ignored.
- The output signal will be saved as a FLAC file.
Args:
signal (np.ndarray) : Signal to save
filename (Path) : Path to save signal
signal_sample_rate (int) : Sample rate of the input signal
output_sample_rate (int) : Sample rate of the output signal
do_clip_signal (bool) : Whether to clip signal
do_soft_clip (bool) : Whether to apply soft clipping
do_scale_signal (bool) : Whether to scale signal
"""
# Resample signal to expected output sample rate
if output_sample_rate is None:
output_sample_rate = signal_sample_rate

if signal_sample_rate != output_sample_rate:
signal = resample(signal, signal_sample_rate, output_sample_rate)

if do_scale_signal:
# Scale stem signal
max_value = np.max(np.abs(signal))
signal = signal / max_value

# Save scale factor
with open(filename.with_suffix(".txt"), "w", encoding="utf-8") as file:
file.write(f"{max_value}")

elif do_clip_signal:
# Clip the signal
signal, n_clipped = clip_signal(signal, do_soft_clip)
if n_clipped > 0:
logger.warning(f"Writing {filename}: {n_clipped} samples clipped")

# Convert signal to 16-bit integer
signal = to_16bit(signal)

# Create flac encoder object to compress and save the signal
FlacEncoder().encode(signal, output_sample_rate, filename)
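A hedged call sketch for the new `save_flac_signal` helper; the signal, sample rates and flags below are illustrative assumptions, not values prescribed by a recipe:

```python
# Illustrative sketch: the signal, rates and flags are assumptions chosen to
# exercise resampling and soft clipping before FLAC encoding.
from pathlib import Path

import numpy as np

from clarity.utils.flac_encoder import save_flac_signal

rng = np.random.default_rng(0)
signal = rng.uniform(-1.2, 1.2, size=(44100, 2))  # 1 s of stereo noise

save_flac_signal(
    signal=signal,
    filename=Path("processed_signal.flac"),
    signal_sample_rate=44100,
    output_sample_rate=32000,  # resampled before encoding
    do_clip_signal=True,       # clip to [-1, 1] ...
    do_soft_clip=True,         # ... using soft clipping
)
```
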
1 change: 1 addition & 0 deletions pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
"pystoi",
"pytorch-lightning",
"resampy",
"safetensors>=0.4.3",
"scikit-learn>=1.0.2",
"scipy>=1.7.3, <1.13.0",
"SoundFile>=0.10.3.post1",
7 changes: 7 additions & 0 deletions recipes/cad2/README.md
@@ -0,0 +1,7 @@
# The Second Cadenza Challenge

Cadenza challenge code for the Second Cadenza Challenge (CAD2).
For more information please visit the [challenge website](https://cadenzachallenge.org/docs/cadenza2/intro).

In the `task1` and `task2` directories, you will find the baseline code
for each task and the instructions on how to obtain the data.
41 changes: 41 additions & 0 deletions recipes/cad2/task1/ConvTasNet/README.md
@@ -0,0 +1,41 @@
# CAD2-TASK1 singing/accompaniment separation model

This recipe contains the necessary content to replicate the separation models used in CAD2-Task1.

- The system is based on the Asteroid source separation framework.
- The ConvTasNet implementation is based on the stereo adaptation by Alexandre Defossez: <https://github.com/facebookresearch/demucs/blob/v1/demucs/tasnet.py>
- The evaluation logic is based on <https://github.com/asteroid-team/asteroid/blob/master/egs/musdb18/X-UMX/eval.py>
- The data loader is based on <https://github.com/asteroid-team/asteroid/blob/master/asteroid/data/musdb18_dataset.py>

You can replicate the Causal and Non-Causal models by running the following commands:

- **To replicate the Non-Causal model**

```bash
python train.py \
--exp_dir /path/to/save/exps \
--batch_size 4 \
--aggregate 2 \
--lr 0.0005 \
--root /path/to/MUSDB18 \
--sample_rate 44100 \
--segment 5.0 \
--samples_per_track 64
```

- **To replicate the Causal model**

```bash
python train.py \
--exp_dir /path/to/save/exps \
--batch_size 4 \
--aggregate 1 \
--lr 0.0005 \
--root /path/to/MUSDB18 \
--sample_rate 44100 \
--segment 4.0 \
--samples_per_track 64 \
--causal True \
--n_src 2 \
--norm_type cLN
```
Empty file.
119 changes: 119 additions & 0 deletions recipes/cad2/task1/ConvTasNet/eval.py
@@ -0,0 +1,119 @@
import argparse
import os
import sys
from pathlib import Path

import musdb
import museval
import soundfile as sf
import torch
import yaml
from local import ConvTasNetStereo

parser = argparse.ArgumentParser()
parser.add_argument(
"--out_dir",
type=str,
required=True,
help="Directory in exp_dir where the eval results will be stored",
)
parser.add_argument(
"--use_gpu", type=int, default=0, help="Whether to use the GPU for model execution"
)
parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root")
parser.add_argument(
"--n_save_ex",
type=int,
default=10,
help="Number of audio examples to save, -1 means all",
)

compute_metrics = ["si_sdr", "sdr", "sir", "sar"]


def main(conf):
model_path = os.path.join(conf["exp_dir"], "best_model.pth")

model = ConvTasNetStereo(
**conf["train_conf"]["convtasnet"],
samplerate=conf["train_conf"]["data"]["sample_rate"],
)

saved = torch.load(model_path, map_location="cpu")
model.load_state_dict(saved["state_dict"])

# Handle device placement
if conf["use_gpu"]:
model.cuda()

model_device = next(model.parameters()).device

# Evaluation is done using the 'remix' mixture
test_set = musdb.DB(
root=conf["train_conf"]["data"]["root"], subsets="test", is_wav=True
)
results = museval.EvalStore()

# Create the directory where the evaluation results will be stored.
eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"])
Path(eval_save_dir).mkdir(exist_ok=True, parents=True)

txtout = os.path.join(eval_save_dir, "results.txt")
fp = open(txtout, "w")

torch.no_grad().__enter__()
for track in test_set:
input_file = os.path.join(
conf["train_conf"]["data"]["root"], "test", track.name, "mixture.wav"
)
# Forward the network on the mixture.
mix, rate = sf.read(input_file, always_2d=True, start=0, stop=None)

# Separate
mix = torch.tensor(mix.T, dtype=torch.float).to(model_device)

est_sources = model.forward(mix.unsqueeze(0))
est_sources = est_sources.squeeze(0).cpu().data.numpy()

estimates = {}
estimates["vocals"] = est_sources[0].T
estimates["accompaniment"] = est_sources[1].T

output_path = Path(os.path.join(eval_save_dir, track.name))
output_path.mkdir(exist_ok=True, parents=True)

print(f"Processing... {track.name}", file=sys.stderr)
print(track.name, file=fp)

for target, estimate in estimates.items():
sf.write(
str(output_path / Path(target).with_suffix(".wav")),
estimate,
conf["train_conf"]["data"]["sample_rate"],
)
track_scores = museval.eval_mus_track(track, estimates)
results.add_track(track_scores.df)
print(track_scores, file=sys.stderr)
print(track_scores, file=fp)
print(results, file=sys.stderr)
print(results, file=fp)
results.save(os.path.join(eval_save_dir, "results.pandas"))
results.frames_agg = "mean"
print(results, file=sys.stderr)
print(results, file=fp)
fp.close()


if __name__ == "__main__":
args = parser.parse_args()
arg_dic = dict(vars(args))
# Load training config
conf_path = os.path.join(args.exp_dir, "conf.yml")
with open(conf_path) as f:
train_conf = yaml.safe_load(f)
arg_dic["sample_rate"] = train_conf["data"]["sample_rate"]
arg_dic["train_conf"] = train_conf

main(arg_dic)

print("Done!")
11 changes: 11 additions & 0 deletions recipes/cad2/task1/ConvTasNet/local/__init__.py
@@ -0,0 +1,11 @@
from .musdb18_dataset import Compose, MUSDB18Dataset, augment_channelswap, augment_gain
from .tasnet import ConvTasNetStereo, overlap_and_add

__all__ = [
"MUSDB18Dataset",
"Compose",
"augment_gain",
"augment_channelswap",
"ConvTasNetStereo",
"overlap_and_add",
]
