Skip to content

Commit

Permalink
Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech
Browse files Browse the repository at this point in the history
…into ctcdecoders
  • Loading branch information
Jackwaterveg committed Jan 24, 2022
2 parents 624e86d + 4907628 commit eb4edad
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 61 deletions.
21 changes: 10 additions & 11 deletions paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,20 +137,19 @@ def ctc_beam_search_decoding_batch(probs_split,
return batch_beam_results


class CTCBeamSearchDecoder(
paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch):
class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch):
"""Wrapper for CtcBeamSearchDecoderBatch.
Args:
vocab_list (list): [Vocabulary list.]
beam_size (int): [Width for beam search.]
num_processes (int): [Number of parallel processes.]
param cutoff_prob (float): [Cutoff probability in vocabulary pruning,
default 1.0, no pruning.]
cutoff_top_n (int): [Cutoff number in pruning, only top cutoff_top_n
vocab_list (list): Vocabulary list.
beam_size (int): Width for beam search.
num_processes (int): Number of parallel processes.
cutoff_prob (float): Cutoff probability in vocabulary pruning,
default 1.0, no pruning.
cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.]
param ext_scorer (Scorer): [External scorer for partially decoded sentence, e.g. word count
or language model.]
used in beam search, default 40.
ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count
or language model.
"""

def __init__(self, vocab_list, batch_size, beam_size, num_processes,
Expand Down
2 changes: 1 addition & 1 deletion paddlespeech/s2t/models/ds2/deepspeech2.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def decode(self, audio, audio_len):
self.decoder.reset_decoder(batch_size=batch_size)
self.decoder.next(probs, eouts_len)
trans_best, trans_beam = self.decoder.decode()

return trans_best

@classmethod
Expand Down
75 changes: 44 additions & 31 deletions paddlespeech/s2t/modules/ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,15 +252,15 @@ def init_decoder(self, batch_size, vocab_list, decoding_method,
init ctc decoders
Args:
batch_size(int): Batch size for input data
vocab_list (list): [List of tokens in the vocabulary, for decoding.]
decoding_method (str): ["ctc_beam_search"]
lang_model_path (str): [language model path]
beam_alpha (float): [beam_alpha]
beam_beta (float): [beam_beta]
beam_size (int): [beam_size]
cutoff_prob (float): [cutoff probability in beam search]
cutoff_top_n (int): [cutoff_top_n]
num_processes (int): [num_processes]
vocab_list (list): List of tokens in the vocabulary, for decoding
decoding_method (str): ctc_beam_search
lang_model_path (str): language model path
beam_alpha (float): beam_alpha
beam_beta (float): beam_beta
beam_size (int): beam_size
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
num_processes (int): num_processes
Raises:
ValueError: when decoding_method is not supported.
Expand Down Expand Up @@ -299,15 +299,15 @@ def decode_probs_offline(self, probs, logits_lens, vocab_list,
Args:
probs (Tensor): activation after softmax
logits_lens (Tensor): audio output lens
vocab_list (list): [List of tokens in the vocabulary, for decoding.]
decoding_method (str): ["ctc_beam_search"]
lang_model_path (str): [language model path]
beam_alpha (float): [beam_alpha]
beam_beta (float): [beam_beta]
beam_size (int): [beam_size]
cutoff_prob (float): [cutoff probability in beam search]
cutoff_top_n (int): [cutoff_top_n]
num_processes (int): [num_processes]
vocab_list (list): List of tokens in the vocabulary, for decoding
decoding_method (str): ctc_beam_search
lang_model_path (str): language model path
beam_alpha (float): beam_alpha
beam_beta (float): beam_beta
beam_size (int): beam_size
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
num_processes (int): num_processes
Raises:
ValueError: when decoding_method is not supported.
Expand Down Expand Up @@ -340,14 +340,14 @@ def get_decoder(self, vocab_list, batch_size, beam_alpha, beam_beta,
"""
init get ctc decoder
Args:
vocab_list (list): [List of tokens in the vocabulary, for decoding.]
vocab_list (list): List of tokens in the vocabulary, for decoding.
batch_size(int): Batch size for input data
beam_alpha (float): [beam_alpha]
beam_beta (float): [beam_beta]
beam_size (int): [beam_size]
num_processes (int): [num_processes]
cutoff_prob (float): [cutoff probability in beam search]
cutoff_top_n (int): [cutoff_top_n]
beam_alpha (float): beam_alpha
beam_beta (float): beam_beta
beam_size (int): beam_size
num_processes (int): num_processes
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
Raises:
ValueError: when decoding_method is not supported.
Expand All @@ -370,8 +370,8 @@ def next(self, probs, logits_lens):
"""
Input probs into ctc decoder
Args:
probs (list(list(float))): [probs for a batch of data]
logits_lens (list(int)): [logits lens for a batch of data]
probs (list(list(float))): probs for a batch of data
logits_lens (list(int)): logits lens for a batch of data
Raises:
Exception: when the ctc decoder is not initialized
ValueError: when decoding_method is not supported.
Expand Down Expand Up @@ -405,8 +405,8 @@ def decode(self):
Exception: when the ctc decoder is not initialized
ValueError: when decoding_method is not supported.
Returns:
results_best (list(str)): [The best result for a batch of data]
results_beam (list(list(str))): [The beam search result for a batch of data]
results_best (list(str)): The best result for a batch of data
results_beam (list(list(str))): The beam search result for a batch of data
"""
if self.beam_search_decoder is None:
raise Exception(
Expand All @@ -426,7 +426,12 @@ def decode(self):

return results_best, results_beam

def reset_decoder(self, batch_size=-1, beam_size=-1, num_processes=-1, cutoff_prob=-1.0, cutoff_top_n=-1):
def reset_decoder(self,
batch_size=-1,
beam_size=-1,
num_processes=-1,
cutoff_prob=-1.0,
cutoff_top_n=-1):
if batch_size > 0:
self.batch_size = batch_size
if beam_size > 0:
Expand All @@ -439,13 +444,21 @@ def reset_decoder(self, batch_size=-1, beam_size=-1, num_processes=-1, cutoff_pr
self.cutoff_top_n = cutoff_top_n
"""
Reset the decoder state
Args:
batch_size(int): Batch size for input data
beam_size (int): beam_size
num_processes (int): num_processes
cutoff_prob (float): cutoff probability in beam search
cutoff_top_n (int): cutoff_top_n
Raises:
Exception: when the ctc decoder is not initialized
"""
if self.beam_search_decoder is None:
raise Exception(
"You need to initialize the beam_search_decoder firstly")
self.beam_search_decoder.reset_state(self.batch_size, self.beam_size, self.num_processes, self.cutoff_prob, self.cutoff_top_n)
self.beam_search_decoder.reset_state(
self.batch_size, self.beam_size, self.num_processes,
self.cutoff_prob, self.cutoff_top_n)

def del_decoder(self):
"""
Expand Down
51 changes: 33 additions & 18 deletions paddlespeech/t2s/exps/synthesize_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ def evaluate(args):
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
vocab_size=vocab_size,
tone_size=tone_size,
spk_num=spk_num,
**am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])

Expand Down Expand Up @@ -171,25 +174,31 @@ def evaluate(args):
InputSpec([-1], dtype=paddle.int64),
InputSpec([1], dtype=paddle.int64)
])
paddle.jit.save(am_inference,
os.path.join(args.inference_dir, args.am))
am_inference = paddle.jit.load(
os.path.join(args.inference_dir, args.am))
else:
am_inference = jit.to_static(
am_inference,
input_spec=[InputSpec([-1], dtype=paddle.int64)])
paddle.jit.save(am_inference,
os.path.join(args.inference_dir, args.am))
am_inference = paddle.jit.load(
os.path.join(args.inference_dir, args.am))
paddle.jit.save(am_inference,
os.path.join(args.inference_dir, args.am))
am_inference = paddle.jit.load(
os.path.join(args.inference_dir, args.am))
elif am_name == 'speedyspeech':
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64),
InputSpec([-1], dtype=paddle.int64)
])
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64), # text
InputSpec([-1], dtype=paddle.int64), # tone
None, # duration
InputSpec([-1], dtype=paddle.int64) # spk_id
])
else:
am_inference = jit.to_static(
am_inference,
input_spec=[
InputSpec([-1], dtype=paddle.int64),
InputSpec([-1], dtype=paddle.int64)
])

paddle.jit.save(am_inference,
os.path.join(args.inference_dir, args.am))
Expand Down Expand Up @@ -242,7 +251,12 @@ def evaluate(args):
mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech':
part_tone_ids = tone_ids[i]
mel = am_inference(part_phone_ids, part_tone_ids)
if am_dataset in {"aishell3", "vctk"}:
spk_id = paddle.to_tensor(args.spk_id)
mel = am_inference(part_phone_ids, part_tone_ids,
spk_id)
else:
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
# vocoder
Expand All @@ -269,8 +283,9 @@ def main():
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
'tacotron2_csmsc'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
Expand Down

0 comments on commit eb4edad

Please sign in to comment.