[TTS]add StarGANv2VC preprocess (#3163)
1 parent c7d24ba · commit bd0d69c
Showing 8 changed files with 485 additions and 23 deletions.
@@ -0,0 +1,101 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
import tqdm

from paddlespeech.t2s.datasets.data_table import DataTable
def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (see details in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="path to the metadata jsonl file of the features to be normalized.")

    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")

    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")

    args = parser.parse_args()
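    # An illustrative invocation (the script name and the paths here are
    # hypothetical, not taken from the commit):
    #   python normalize.py \
    #       --metadata dump/raw/metadata.jsonl \
    #       --dumpdir dump/norm \
    #       --speaker-dict dump/speaker_id_map.txt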
|
||
dumpdir = Path(args.dumpdir).expanduser() | ||
# use absolute path | ||
dumpdir = dumpdir.resolve() | ||
dumpdir.mkdir(parents=True, exist_ok=True) | ||
|
||
# get dataset | ||
with jsonlines.open(args.metadata, 'r') as reader: | ||
metadata = list(reader) | ||
dataset = DataTable( | ||
metadata, converters={ | ||
"speech": np.load, | ||
}) | ||
logging.info(f"The number of files = {len(dataset)}.") | ||
|
||
vocab_speaker = {} | ||
with open(args.speaker_dict, 'rt') as f: | ||
spk_id = [line.strip().split() for line in f.readlines()] | ||
for spk, id in spk_id: | ||
vocab_speaker[spk] = int(id) | ||
    # process each file
    output_metadata = []

    for item in tqdm.tqdm(dataset):
        utt_id = item['utt_id']
        speech = item['speech']

        # normalize
        # the mean/std statistics are hardcoded here for now
        mean, std = -4, 4
        speech = (speech - mean) / std
        speech_path = dumpdir / f"{utt_id}_speech.npy"
        np.save(speech_path, speech.astype(np.float32), allow_pickle=False)

        spk_id = vocab_speaker[item["speaker"]]
        record = {
            "utt_id": utt_id,
            "spk_id": spk_id,
            "speech": str(speech_path),
        }

        output_metadata.append(record)
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = dumpdir / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
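For context, a minimal sketch of reading back what this script dumps. It is not part of the commit: the dump/norm path is an assumption, while the record keys and the (-4, 4) statistics mirror the code above.

# Hypothetical consumer of the normalized dump; assumes the script above
# was run with --dumpdir dump/norm (the path is an assumption).
import jsonlines
import numpy as np

MEAN, STD = -4, 4  # must match the statistics hardcoded in the script

with jsonlines.open("dump/norm/metadata.jsonl", 'r') as reader:
    records = list(reader)

first = records[0]
speech = np.load(first["speech"])  # normalized feature array, float32
print(first["utt_id"], first["spk_id"], speech.shape)

# invert the normalization to recover the unnormalized features
raw_speech = speech * STD + MEAN

Because the statistics are hardcoded "for now", any downstream consumer has to keep MEAN and STD in sync with the script by hand.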