Merge pull request #399 from claritychallenge/371-CAD2
Adding Cadenza 2
groadabike authored Jul 12, 2024
2 parents 86ab22a + 61e155c commit bebd5fc
Showing 47 changed files with 5,362 additions and 73 deletions.
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
@@ -70,6 +70,9 @@ repos:
- id: mypy
args:
- --explicit-package-bases
additional_dependencies:
- 'types-PyYAML'


- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
7 changes: 7 additions & 0 deletions clarity/enhancer/multiband_compressor/__init__.py
@@ -0,0 +1,7 @@
from clarity.enhancer.multiband_compressor.compressor_qmul import Compressor
from clarity.enhancer.multiband_compressor.crossover import Crossover
from clarity.enhancer.multiband_compressor.multiband_compressor import (
MultibandCompressor,
)

__all__ = ["MultibandCompressor", "Compressor", "Crossover"]
60 changes: 25 additions & 35 deletions clarity/enhancer/multiband_compressor/compressor_qmul.py
@@ -2,8 +2,6 @@

from __future__ import annotations

import warnings

import numpy as np


@@ -68,39 +66,28 @@ def __init__(
knee_width: float = 0.0,
sample_rate: float = 44100.0,
) -> None:
if threshold > 0 or threshold < -60:
warnings.warn(
"Threshold outside the recommended range [0.0, -60.0] dB."
f" {threshold} dB was provided.",
stacklevel=1,
)
if ratio < 1 or ratio > 20:
warnings.warn(
"Ratio outside the recommended range [1.0, 20.0]."
f" {ratio} was provided.",
stacklevel=1,
)

if attack < 0.1 or attack > 80.0:
warnings.warn(
"Attack outside the recommended range [0.1, 80.0] ms."
f" {attack} ms was provided.",
stacklevel=1,
)

if release < 0.1 or release > 1000.0:
warnings.warn(
"Release outside the recommended range [0.1, 1000.0] ms."
f" {release} ms was provided.",
stacklevel=1,
)

if makeup_gain < 0 or makeup_gain > 24:
warnings.warn(
"Make-up gain outside the recommended range [0.0, 24.0] dB."
f" {makeup_gain} dB was provided.",
stacklevel=1,
)
"""Constructor for the Compressor class.
Args:
threshold (float): The threshold level in dB.
ratio (float): The compression ratio.
attack (float): The attack time in ms.
release (float): The release time in ms.
makeup_gain (float): The make-up gain in dB.
knee_width (float): The knee width in dB.
sample_rate (float): The sample rate in Hz.
Notes:
The original implementation recommends a range for each parameter.
These ranges are not enforced in this implementation.
The recommended ranges are:
- threshold in the range [0.0, -60.0] dB,
- ratio in the range [1.0, 20.0],
- attack in the range [0.1, 80.0] ms,
- release in the range [0.1, 1000.0] ms,
- makeup_gain in the range [0.0, 24.0] dB,
- knee_width in the range [0.0, 10.0] dB.
"""

self.threshold = float(threshold)
self.ratio = float(ratio)
@@ -110,6 +97,8 @@ def __init__(
self.sample_rate = float(sample_rate)
self.knee_width = float(knee_width)

self.eps = 1e-12

self.alpha_attack = np.exp(-1.0 / (0.001 * self.sample_rate * self.attack))
self.alpha_release = np.exp(-1.0 / (0.001 * self.sample_rate * self.release))

@@ -125,6 +114,7 @@ def __call__(self, input_signal: np.ndarray) -> np.ndarray:
"""

# Compute the instantaneous desired levels
input_signal[input_signal == 0] = self.eps
x_g = 20 * np.log10(np.abs(input_signal))
x_g[x_g < -120] = -120
y_g = self.threshold + (x_g - self.threshold) / self.ratio
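The removed range warnings are now documented in the docstring instead, and `__call__` replaces exact zeros with `eps` before the `20 * log10(...)` level computation so the gain computer never sees `-inf`. A minimal usage sketch, assuming the exports from `clarity/enhancer/multiband_compressor/__init__.py` above; the parameter values are illustrative choices inside the recommended ranges, not challenge settings:

```python
# Illustrative sketch only: the parameter values are assumptions chosen from
# the recommended ranges in the docstring, not values used by the challenge.
import numpy as np

from clarity.enhancer.multiband_compressor import Compressor

compressor = Compressor(
    threshold=-20.0,  # dB, recommended range [0.0, -60.0]
    ratio=4.0,        # recommended range [1.0, 20.0]
    attack=5.0,       # ms, recommended range [0.1, 80.0]
    release=100.0,    # ms, recommended range [0.1, 1000.0]
    makeup_gain=0.0,  # dB, recommended range [0.0, 24.0]
    knee_width=2.0,   # dB, recommended range [0.0, 10.0]
    sample_rate=44100.0,
)

rng = np.random.default_rng(0)
signal = 0.1 * rng.standard_normal(44100)  # one second of noise at 44.1 kHz
compressed = compressor(signal)            # apply the sample-wise gain computer
```
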
68 changes: 66 additions & 2 deletions clarity/utils/flac_encoder.py
@@ -15,6 +15,8 @@
import pyflac as pf
import soundfile as sf

from clarity.utils.signal_processing import clip_signal, resample, to_16bit

logger = logging.getLogger(__name__)


@@ -189,11 +191,11 @@ def encode(
"""
if signal.dtype != np.int16:
logger.error(
f"FLAC encoder only supports 16-bit integer signals, "
"FLAC encoder only supports 16-bit integer signals, "
f"but got {signal.dtype}"
)
raise ValueError(
f"FLAC encoder only supports 16-bit integer signals, "
"FLAC encoder only supports 16-bit integer signals, "
f"but got {signal.dtype}"
)

@@ -261,3 +263,65 @@ def read_flac_signal(filename: Path) -> tuple[np.ndarray, float]:
# Scale signal
signal *= max_value
return signal, sample_rate


def save_flac_signal(
signal: np.ndarray,
filename: Path,
signal_sample_rate: int,
output_sample_rate: int | None = None,
do_clip_signal: bool = False,
do_soft_clip: bool = False,
do_scale_signal: bool = False,
) -> None:
"""
Function to save output signals.
- The output signal will be resampled to ``output_sample_rate``.
If ``output_sample_rate`` is None, the output signal will keep
the sample rate of the input signal.
- The output signal will be clipped to [-1, 1] if ``do_clip_signal`` is True,
using soft clipping if ``do_soft_clip`` is also True. If
``do_clip_signal`` is False, ``do_soft_clip`` is ignored.
- The output signal will be scaled to [-1, 1] if ``do_scale_signal`` is True.
If the signal is scaled, the scale factor is saved in a TXT file
alongside the FLAC file. Scaling takes precedence: if ``do_scale_signal``
is True, ``do_clip_signal`` is ignored.
- The output signal will be saved as a FLAC file.
Args:
signal (np.ndarray) : Signal to save
filename (Path) : Path to save signal
signal_sample_rate (int) : Sample rate of the input signal
output_sample_rate (int) : Sample rate of the output signal
do_clip_signal (bool) : Whether to clip signal
do_soft_clip (bool) : Whether to apply soft clipping
do_scale_signal (bool) : Whether to scale signal
"""
# Resample signal to expected output sample rate
if output_sample_rate is None:
output_sample_rate = signal_sample_rate

if signal_sample_rate != output_sample_rate:
signal = resample(signal, signal_sample_rate, output_sample_rate)

if do_scale_signal:
# Scale stem signal
max_value = np.max(np.abs(signal))
signal = signal / max_value

# Save scale factor
with open(filename.with_suffix(".txt"), "w", encoding="utf-8") as file:
file.write(f"{max_value}")

elif do_clip_signal:
# Clip the signal
signal, n_clipped = clip_signal(signal, do_soft_clip)
if n_clipped > 0:
logger.warning(f"Writing {filename}: {n_clipped} samples clipped")

# Convert signal to 16-bit integer
signal = to_16bit(signal)

# Create flac encoder object to compress and save the signal
FlacEncoder().encode(signal, output_sample_rate, filename)
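A hedged call sketch for the new `save_flac_signal` helper; the signal, sample rates and flags below are illustrative assumptions, not values prescribed by a recipe:

```python
# Illustrative sketch: the signal, rates and flags are assumptions chosen to
# exercise resampling and soft clipping before FLAC encoding.
from pathlib import Path

import numpy as np

from clarity.utils.flac_encoder import save_flac_signal

rng = np.random.default_rng(0)
signal = rng.uniform(-1.2, 1.2, size=(44100, 2))  # 1 s of stereo noise

save_flac_signal(
    signal=signal,
    filename=Path("processed_signal.flac"),
    signal_sample_rate=44100,
    output_sample_rate=32000,  # resampled before encoding
    do_clip_signal=True,       # clip to [-1, 1] ...
    do_soft_clip=True,         # ... using soft clipping
)
```
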
1 change: 1 addition & 0 deletions pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
"pystoi",
"pytorch-lightning",
"resampy",
"safetensors>=0.4.3",
"scikit-learn>=1.0.2",
"scipy>=1.7.3, <1.13.0",
"SoundFile>=0.10.3.post1",
7 changes: 7 additions & 0 deletions recipes/cad2/README.md
@@ -0,0 +1,7 @@
# The Second Cadenza Challenge

Cadenza challenge code for the Second Cadenza Challenge (CAD2).
For more information please visit the [challenge website](https://cadenzachallenge.org/docs/cadenza2/intro).

In the `task1` and `task2` directories, you will find the baseline code
for each task and the instructions on how to obtain the data.
41 changes: 41 additions & 0 deletions recipes/cad2/task1/ConvTasNet/README.md
@@ -0,0 +1,41 @@
# CAD2-TASK1 singing/accompaniment separation model

This recipe contains the necessary content to replicate the separation models used in CAD2-Task1.

- The system is based on the Asteroid source separation framework.
- The ConvTasNet implementation is based on the stereo adaptation by Alexandre Defossez: <https://github.com/facebookresearch/demucs/blob/v1/demucs/tasnet.py>
- The evaluation logic is based on <https://github.com/asteroid-team/asteroid/blob/master/egs/musdb18/X-UMX/eval.py>
- The data loader is based on <https://github.com/asteroid-team/asteroid/blob/master/asteroid/data/musdb18_dataset.py>

You can replicate the Causal and Non-Causal models by running the following commands:

- **To replicate the Non-Causal model**

```bash
python train.py \
--exp_dir /path/to/save/exps \
--batch_size 4 \
--aggregate 2 \
--lr 0.0005 \
--root /path/to/MUSDB18 \
--sample_rate 44100 \
--segment 5.0 \
--samples_per_track 64
```

- **To replicate the Causal model**

```bash
python train.py \
--exp_dir /path/to/save/exps \
--batch_size 4 \
--aggregate 1 \
--lr 0.0005 \
--root /path/to/MUSDB18 \
--sample_rate 44100 \
--segment 4.0 \
--samples_per_track 64 \
--causal True \
--n_src 2 \
--norm_type cLN
```
Empty file.
119 changes: 119 additions & 0 deletions recipes/cad2/task1/ConvTasNet/eval.py
@@ -0,0 +1,119 @@
import argparse
import os
import sys
from pathlib import Path

import musdb
import museval
import soundfile as sf
import torch
import yaml
from local import ConvTasNetStereo

parser = argparse.ArgumentParser()
parser.add_argument(
"--out_dir",
type=str,
required=True,
help="Directory in exp_dir where the eval results will be stored",
)
parser.add_argument(
"--use_gpu", type=int, default=0, help="Whether to use the GPU for model execution"
)
parser.add_argument("--exp_dir", default="exp/tmp", help="Experiment root")
parser.add_argument(
"--n_save_ex",
type=int,
default=10,
help="Number of audio examples to save, -1 means all",
)

compute_metrics = ["si_sdr", "sdr", "sir", "sar"]


def main(conf):
model_path = os.path.join(conf["exp_dir"], "best_model.pth")

model = ConvTasNetStereo(
**conf["train_conf"]["convtasnet"],
samplerate=conf["train_conf"]["data"]["sample_rate"],
)

saved = torch.load(model_path, map_location="cpu")
model.load_state_dict(saved["state_dict"])

# Handle device placement
if conf["use_gpu"]:
model.cuda()

model_device = next(model.parameters()).device

# Evaluation is done using the 'remix' mixture
test_set = musdb.DB(
root=conf["train_conf"]["data"]["root"], subsets="test", is_wav=True
)
results = museval.EvalStore()

# Create the directory where the evaluation results will be stored.
eval_save_dir = os.path.join(conf["exp_dir"], conf["out_dir"])
Path(eval_save_dir).mkdir(exist_ok=True, parents=True)

txtout = os.path.join(eval_save_dir, "results.txt")
fp = open(txtout, "w")

torch.no_grad().__enter__()
for track in test_set:
input_file = os.path.join(
conf["train_conf"]["data"]["root"], "test", track.name, "mixture.wav"
)
# Forward the network on the mixture.
mix, rate = sf.read(input_file, always_2d=True, start=0, stop=None)

# Separate
mix = torch.tensor(mix.T, dtype=torch.float).to(model_device)

est_sources = model.forward(mix.unsqueeze(0))
est_sources = est_sources.squeeze(0).cpu().data.numpy()

estimates = {}
estimates["vocals"] = est_sources[0].T
estimates["accompaniment"] = est_sources[1].T

output_path = Path(os.path.join(eval_save_dir, track.name))
output_path.mkdir(exist_ok=True, parents=True)

print(f"Processing... {track.name}", file=sys.stderr)
print(track.name, file=fp)

for target, estimate in estimates.items():
sf.write(
str(output_path / Path(target).with_suffix(".wav")),
estimate,
conf["train_conf"]["data"]["sample_rate"],
)
track_scores = museval.eval_mus_track(track, estimates)
results.add_track(track_scores.df)
print(track_scores, file=sys.stderr)
print(track_scores, file=fp)
print(results, file=sys.stderr)
print(results, file=fp)
results.save(os.path.join(eval_save_dir, "results.pandas"))
results.frames_agg = "mean"
print(results, file=sys.stderr)
print(results, file=fp)
fp.close()


if __name__ == "__main__":
args = parser.parse_args()
arg_dic = dict(vars(args))
# Load training config
conf_path = os.path.join(args.exp_dir, "conf.yml")
with open(conf_path) as f:
train_conf = yaml.safe_load(f)
arg_dic["sample_rate"] = train_conf["data"]["sample_rate"]
arg_dic["train_conf"] = train_conf

main(arg_dic)

print("Done!")
11 changes: 11 additions & 0 deletions recipes/cad2/task1/ConvTasNet/local/__init__.py
@@ -0,0 +1,11 @@
from .musdb18_dataset import Compose, MUSDB18Dataset, augment_channelswap, augment_gain
from .tasnet import ConvTasNetStereo, overlap_and_add

__all__ = [
"MUSDB18Dataset",
"Compose",
"augment_gain",
"augment_channelswap",
"ConvTasNetStereo",
"overlap_and_add",
]
