Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into ctcdecoders
PaddlePaddle · Jan 24, 2022 · eb4edad · eb4edad
2 parents 624e86d + 4907628
commit eb4edad
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 61 deletions.
diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py b/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py
@@ -137,20 +137,19 @@ def ctc_beam_search_decoding_batch(probs_split,
     return batch_beam_results
 
 
-class CTCBeamSearchDecoder(
-        paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch):
+class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch):
     """Wrapper for CtcBeamSearchDecoderBatch.
     Args:
-        vocab_list (list): [Vocabulary list.]
-        beam_size (int): [Width for beam search.]
-        num_processes (int): [Number of parallel processes.]
-        param cutoff_prob (float): [Cutoff probability in vocabulary pruning,
-                            default 1.0, no pruning.]
-        cutoff_top_n (int): [Cutoff number in pruning, only top cutoff_top_n
+        vocab_list (list): Vocabulary list.
+        beam_size (int): Width for beam search.
+        num_processes (int): Number of parallel processes.
+        param cutoff_prob (float): Cutoff probability in vocabulary pruning,
+                            default 1.0, no pruning.
+        cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n
                             characters with highest probs in vocabulary will be
-                            used in beam search, default 40.]
-        param ext_scorer (Scorer): [External scorer for partially decoded sentence, e.g. word count
-                                or language model.]
+                            used in beam search, default 40.
+        param ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count
+                                or language model.
     """
 
     def __init__(self, vocab_list, batch_size, beam_size, num_processes,

diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -174,7 +174,7 @@ def decode(self, audio, audio_len):
         self.decoder.reset_decoder(batch_size=batch_size)
         self.decoder.next(probs, eouts_len)
         trans_best, trans_beam = self.decoder.decode()
-        
+
         return trans_best
 
     @classmethod

diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py
@@ -252,15 +252,15 @@ def init_decoder(self, batch_size, vocab_list, decoding_method,
         init ctc decoders
         Args:
             batch_size(int): Batch size for input data
-            vocab_list (list): [List of tokens in the vocabulary, for decoding.]
-            decoding_method (str): ["ctc_beam_search"]
-            lang_model_path (str): [language model path]
-            beam_alpha (float): [beam_alpha]
-            beam_beta (float): [beam_beta]
-            beam_size (int): [beam_size]
-            cutoff_prob (float): [cutoff probability in beam search]
-            cutoff_top_n (int): [cutoff_top_n]
-            num_processes (int): [num_processes]
+            vocab_list (list): List of tokens in the vocabulary, for decoding
+            decoding_method (str): ctc_beam_search
+            lang_model_path (str): language model path
+            beam_alpha (float): beam_alpha
+            beam_beta (float): beam_beta
+            beam_size (int): beam_size
+            cutoff_prob (float): cutoff probability in beam search
+            cutoff_top_n (int): cutoff_top_n
+            num_processes (int): num_processes
 
         Raises:
             ValueError: when decoding_method not support.
@@ -299,15 +299,15 @@ def decode_probs_offline(self, probs, logits_lens, vocab_list,
         Args:
             probs (Tensor): activation after softmax
             logits_lens (Tensor): audio output lens
-            vocab_list (list): [List of tokens in the vocabulary, for decoding.]
-            decoding_method (str): ["ctc_beam_search"]
-            lang_model_path (str): [language model path]
-            beam_alpha (float): [beam_alpha]
-            beam_beta (float): [beam_beta]
-            beam_size (int): [beam_size]
-            cutoff_prob (float): [cutoff probability in beam search]
-            cutoff_top_n (int): [cutoff_top_n]
-            num_processes (int): [num_processes]
+            vocab_list (list): List of tokens in the vocabulary, for decoding
+            decoding_method (str): ctc_beam_search
+            lang_model_path (str): language model path
+            beam_alpha (float): beam_alpha
+            beam_beta (float): beam_beta
+            beam_size (int): beam_size
+            cutoff_prob (float): cutoff probability in beam search
+            cutoff_top_n (int): cutoff_top_n
+            num_processes (int): num_processes
 
         Raises:
             ValueError: when decoding_method not support.
@@ -340,14 +340,14 @@ def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta,
         """
         init get ctc decoder
         Args:
-            vocab_list (list): [List of tokens in the vocabulary, for decoding.]
+            vocab_list (list): List of tokens in the vocabulary, for decoding.
             batch_size(int): Batch size for input data
-            beam_alpha (float): [beam_alpha]
-            beam_beta (float): [beam_beta]
-            beam_size (int): [beam_size]
-            num_processes (int): [num_processes]
-            cutoff_prob (float): [cutoff probability in beam search]
-            cutoff_top_n (int): [cutoff_top_n]
+            beam_alpha (float): beam_alpha
+            beam_beta (float): beam_beta
+            beam_size (int): beam_size
+            num_processes (int): num_processes
+            cutoff_prob (float): cutoff probability in beam search
+            cutoff_top_n (int): cutoff_top_n
 
         Raises:
             ValueError: when decoding_method not support.
@@ -370,8 +370,8 @@ def next(self, probs, logits_lens):
         """
         Input probs into ctc decoder
         Args:
-            probs (list(list(float))): [probs for a batch of data]
-            logits_lens (list(int)): [logits lens for a batch of data]
+            probs (list(list(float))): probs for a batch of data
+            logits_lens (list(int)): logits lens for a batch of data
         Raises:
             Exception: when the ctc decoder is not initialized
             ValueError: when decoding_method not support.
@@ -405,8 +405,8 @@ def decode(self):
             Exception: when the ctc decoder is not initialized
             ValueError: when decoding_method not support.
         Returns:
-            results_best (list(str)): [The best result for a batch of data]
-            results_beam (list(list(str))): [The beam search result for a batch of data]
+            results_best (list(str)): The best result for a batch of data
+            results_beam (list(list(str))): The beam search result for a batch of data
         """
         if self.beam_search_decoder is None:
             raise Exception(
@@ -426,7 +426,12 @@ def decode(self):
 
         return results_best, results_beam
 
-    def reset_decoder(self, batch_size=-1, beam_size=-1, num_processes=-1, cutoff_prob=-1.0, cutoff_top_n=-1):
+    def reset_decoder(self,
+                      batch_size=-1,
+                      beam_size=-1,
+                      num_processes=-1,
+                      cutoff_prob=-1.0,
+                      cutoff_top_n=-1):
         if batch_size > 0:
             self.batch_size = batch_size
         if beam_size > 0:
@@ -439,13 +444,21 @@ def reset_decoder(self, batch_size=-1, beam_size=-1, num_processes=-1, cutoff_pr
             self.cutoff_top_n = cutoff_top_n
         """
         Reset the decoder state
+        Args:
+            batch_size(int): Batch size for input data
+            beam_size (int): beam_size
+            num_processes (int): num_processes
+            cutoff_prob (float): cutoff probability in beam search
+            cutoff_top_n (int): cutoff_top_n
         Raises:
             Exception: when the ctc decoder is not initialized
         """
         if self.beam_search_decoder is None:
             raise Exception(
                 "You need to initialize the beam_search_decoder firstly")
-        self.beam_search_decoder.reset_state(self.batch_size, self.beam_size, self.num_processes, self.cutoff_prob, self.cutoff_top_n)
+        self.beam_search_decoder.reset_state(
+            self.batch_size, self.beam_size, self.num_processes,
+            self.cutoff_prob, self.cutoff_top_n)
 
     def del_decoder(self):
         """

diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -129,7 +129,10 @@ def evaluate(args):
             idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
     elif am_name == 'speedyspeech':
         am = am_class(
-            vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
+            vocab_size=vocab_size,
+            tone_size=tone_size,
+            spk_num=spk_num,
+            **am_config["model"])
     elif am_name == 'tacotron2':
         am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
 
@@ -171,25 +174,31 @@ def evaluate(args):
                         InputSpec([-1], dtype=paddle.int64),
                         InputSpec([1], dtype=paddle.int64)
                     ])
-                paddle.jit.save(am_inference,
-                                os.path.join(args.inference_dir, args.am))
-                am_inference = paddle.jit.load(
-                    os.path.join(args.inference_dir, args.am))
             else:
                 am_inference = jit.to_static(
                     am_inference,
                     input_spec=[InputSpec([-1], dtype=paddle.int64)])
-                paddle.jit.save(am_inference,
-                                os.path.join(args.inference_dir, args.am))
-                am_inference = paddle.jit.load(
-                    os.path.join(args.inference_dir, args.am))
+            paddle.jit.save(am_inference,
+                            os.path.join(args.inference_dir, args.am))
+            am_inference = paddle.jit.load(
+                os.path.join(args.inference_dir, args.am))
         elif am_name == 'speedyspeech':
-            am_inference = jit.to_static(
-                am_inference,
-                input_spec=[
-                    InputSpec([-1], dtype=paddle.int64),
-                    InputSpec([-1], dtype=paddle.int64)
-                ])
+            if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
+                am_inference = jit.to_static(
+                    am_inference,
+                    input_spec=[
+                        InputSpec([-1], dtype=paddle.int64),  # text
+                        InputSpec([-1], dtype=paddle.int64),  # tone
+                        None,  # duration
+                        InputSpec([-1], dtype=paddle.int64)  # spk_id
+                    ])
+            else:
+                am_inference = jit.to_static(
+                    am_inference,
+                    input_spec=[
+                        InputSpec([-1], dtype=paddle.int64),
+                        InputSpec([-1], dtype=paddle.int64)
+                    ])
 
             paddle.jit.save(am_inference,
                             os.path.join(args.inference_dir, args.am))
@@ -242,7 +251,12 @@ def evaluate(args):
                         mel = am_inference(part_phone_ids)
                 elif am_name == 'speedyspeech':
                     part_tone_ids = tone_ids[i]
-                    mel = am_inference(part_phone_ids, part_tone_ids)
+                    if am_dataset in {"aishell3", "vctk"}:
+                        spk_id = paddle.to_tensor(args.spk_id)
+                        mel = am_inference(part_phone_ids, part_tone_ids,
+                                           spk_id)
+                    else:
+                        mel = am_inference(part_phone_ids, part_tone_ids)
                 elif am_name == 'tacotron2':
                     mel = am_inference(part_phone_ids)
                 # vocoder
@@ -269,8 +283,9 @@ def main():
         type=str,
         default='fastspeech2_csmsc',
         choices=[
-            'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
-            'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
+            'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
+            'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
+            'tacotron2_csmsc'
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(